[EC] P-256/384/521 s2n-bignum scalar multiplication (#2036)

dkostic · web-flow · commit 02ea4c4af750 · 2024-12-13T11:43:18.000-05:00
For curves P-256/384/521 we use the s2n-bignum implementation
of scalar multiplication of an arbitrary point. This gives the following
performance improvements (measurements in ops/s):
```
__Apple M1__| before |  after | speedup |
P-256 MUL   |  27871 |  31607 |  1.13x  |
P-256 ECDH  |  20804 |  22778 |  1.11x  |
P-384 MUL   |   7245 |   8618 |  1.19x  |
P-384 ECDH  |   5367 |   5986 |  1.11x  |
P-521 MUL   |   5040 |   5806 |  1.15x  |
P-521 ECDH  |   3696 |   4053 |  1.10x  |

____Intel___| before |  after | speedup |
P-256 MUL   |  21913 |  25650 |  1.17x  |
P-256 ECDH  |  17188 |  19453 |  1.13x  |
P-384 MUL   |   6554 |   7691 |  1.17x  |
P-384 ECDH  |   4731 |   5321 |  1.12x  |
P-521 MUL   |   4400 |   5151 |  1.17x  |
P-521 ECDH  |   3192 |   3514 |  1.10x  |
```
where Apple M1 is an M1-based MacBook laptop, and
Intel is an Intel(R) Xeon(R) Platinum 8488C.
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
@@ -205,6 +205,9 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
   set(
     S2N_BIGNUM_ASM_SOURCES
 
+    p256/p256_montjscalarmul.S
+    p256/p256_montjscalarmul_alt.S
+
     p384/bignum_add_p384.S
     p384/bignum_sub_p384.S
     p384/bignum_neg_p384.S
@@ -218,6 +221,8 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
     p384/bignum_littleendian_6.S
     p384/p384_montjdouble.S
     p384/p384_montjdouble_alt.S
+    p384/p384_montjscalarmul.S
+    p384/p384_montjscalarmul_alt.S
 
     p521/bignum_add_p521.S
     p521/bignum_sub_p521.S
@@ -230,6 +235,8 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
     p521/bignum_fromlebytes_p521.S
     p521/p521_jdouble.S
     p521/p521_jdouble_alt.S
+    p521/p521_jscalarmul.S
+    p521/p521_jscalarmul_alt.S
 
     curve25519/bignum_mod_n25519.S
     curve25519/bignum_neg_p25519.S
diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c
@@ -31,6 +31,11 @@
 #include "../../internal.h"
 #include "internal.h"
 #include "p256-nistz.h"
+#include "ec_nistp.h"
+
+#if defined(EC_NISTP_USE_S2N_BIGNUM)
+#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h"
+#endif
 
 #if !defined(OPENSSL_NO_ASM) &&  \
     (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&    \
@@ -304,6 +309,13 @@ static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) {
 static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                    const EC_JACOBIAN *p,
                                    const EC_SCALAR *scalar) {
+#if defined(EC_NISTP_USE_S2N_BIGNUM)
+  ec_nistp_felem_limb in[P256_LIMBS * 3];
+  ec_nistp_felem_limb out[P256_LIMBS * 3];
+  ec_nistp_coordinates_to_point(in, p->X.words, p->Y.words, p->Z.words, P256_LIMBS);
+  p256_montjscalarmul_selector(out, scalar->words, in);
+  ec_nistp_point_to_coordinates(r->X.words, r->Y.words, r->Z.words, out, P256_LIMBS);
+#else
   stack_align_type buffer_out[32 + sizeof(P256_POINT)];
   P256_POINT *aligned_out = (P256_POINT *) align_pointer(buffer_out, 32);
   ecp_nistz256_windowed_mul(group, aligned_out, p, scalar);
@@ -312,6 +324,7 @@ static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
   OPENSSL_memcpy(r->X.words, aligned_out->X, P256_LIMBS * sizeof(BN_ULONG));
   OPENSSL_memcpy(r->Y.words, aligned_out->Y, P256_LIMBS * sizeof(BN_ULONG));
   OPENSSL_memcpy(r->Z.words, aligned_out->Z, P256_LIMBS * sizeof(BN_ULONG));
+#endif
 }
 
 static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_JACOBIAN *r,
diff --git a/crypto/fipsmodule/ec/p384.c b/crypto/fipsmodule/ec/p384.c
@@ -438,39 +438,6 @@ static int ec_GFp_nistp384_cmp_x_coordinate(const EC_GROUP *group,
   return 0;
 }
 
-// ----------------------------------------------------------------------------
-//                    SCALAR MULTIPLICATION OPERATIONS
-// ----------------------------------------------------------------------------
-//
-// The method for computing scalar products in functions:
-//   - |ec_GFp_nistp384_point_mul|,
-//   - |ec_GFp_nistp384_point_mul_base|,
-//   - |ec_GFp_nistp384_point_mul_public|,
-// is adapted from ECCKiila project (https://arxiv.org/abs/2007.11481).
-//
-// One difference from the processing in the ECCKiila project is the order of
-// the digit processing in |ec_GFp_nistp384_point_mul_base|, where we end the
-// processing with the least significant digit to be able to apply the
-// analysis results detailed at the bottom of this file. In
-// |ec_GFp_nistp384_point_mul_base| and |ec_GFp_nistp384_point_mul|, we
-// considered using window size 7 based on that same analysis. However, the
-// table size and performance measurements were more preferable for window
-// size 5. The potential issue with different window sizes is that for some
-// sizes, a scalar can be found such that a case of point doubling instead of
-// point addition happens in the scalar multiplication. This would make
-// the multiplication non constant-time. To the best of our knowledge this
-// timing leak is not an exploitable issue because the only scalar for which
-// the leak can happen is already known by the attacker. This is also provided
-// that this recoding and window size are only used with ECDH and ECDSA
-// protocols. Any other use would need to be analyzed to determine whether it is
-// secure and the user should be aware of this side channel of a particular
-// scalar value.
-//
-// OpenSSL has a similar analysis for P-521 implementation:
-// https://github.com/openssl/openssl/blob/e9492d1cecf459261f1f5ac0eb03e9c631600537/crypto/ec/ecp_nistp521.c#L1318
-//
-// For detailed analysis of different window sizes see the bottom of this file.
-
 // Multiplication of an arbitrary point by a scalar, r = [scalar]P.
 static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                       const EC_JACOBIAN *p,
@@ -482,7 +449,11 @@ static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
   p384_from_generic(tmp[1], &p->Y);
   p384_from_generic(tmp[2], &p->Z);
 
+#if defined(EC_NISTP_USE_S2N_BIGNUM)
+  p384_montjscalarmul_selector((uint64_t*)res, scalar->words, (uint64_t*)tmp);
+#else
   ec_nistp_scalar_mul(p384_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
+#endif
 
   p384_to_generic(&r->X, res[0]);
   p384_to_generic(&r->Y, res[1]);
diff --git a/crypto/fipsmodule/ec/p521.c b/crypto/fipsmodule/ec/p521.c
@@ -377,37 +377,6 @@ static void ec_GFp_nistp521_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
   p521_to_generic(&r->Z, z);
 }
 
-// ----------------------------------------------------------------------------
-//                    SCALAR MULTIPLICATION OPERATIONS
-// ----------------------------------------------------------------------------
-//
-// The method for computing scalar products in functions:
-//   - |ec_GFp_nistp521_point_mul|,
-//   - |ec_GFp_nistp521_point_mul_base|,
-//   - |ec_GFp_nistp521_point_mul_public|,
-// is adapted from ECCKiila project (https://arxiv.org/abs/2007.11481).
-// The main difference is that we use a window of size 7 instead of 5 for the
-// first two functions. The potential issue with window sizes is that for some
-// sizes a scalar can be found such that a case of point doubling instead of
-// point addition happens in the scalar multiplication. This would make the
-// multiplication non constant-time. Therefore, such window sizes have to be
-// avoided. The windows size of 7 is chosen based on analysis analogous to
-// the one in |ec_GFp_nistp_recode_scalar_bits| function in |util.c| file.
-// See the analysis at the bottom of this file.
-//
-// Moreover, the order in which the digits of the scalar are processed in
-// |ec_GFp_nistp521_point_mul_base| is different from the ECCKiila project, to
-// ensure that the least significant digit is processed last which together
-// with the window size 7 guarantees constant-time execution of the function.
-//
-// Another difference is that in |ec_GFp_nistp521_point_mul_public| function we
-// use window size 5 for the public point and 7 for the base point. Here it is
-// ok to use window of size 5 since the scalar is public and therefore the
-// function doesn't have to be constant-time.
-//
-// The precomputed table of base point multiples is generated by the code in
-// |make_tables.go| script.
-
 // Multiplication of an arbitrary point by a scalar, r = [scalar]P.
 static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                       const EC_JACOBIAN *p,
@@ -419,7 +388,11 @@ static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
   p521_from_generic(tmp[1], &p->Y);
   p521_from_generic(tmp[2], &p->Z);
 
+#if defined(EC_NISTP_USE_S2N_BIGNUM)
+  p521_jscalarmul_selector((uint64_t*)res, scalar->words, (uint64_t*)tmp);
+#else
   ec_nistp_scalar_mul(p521_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
+#endif
 
   p521_to_generic(&r->X, res[0]);
   p521_to_generic(&r->Y, res[1]);
diff --git a/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h b/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h
@@ -56,6 +56,13 @@ static inline uint8_t use_s2n_bignum_alt(void) {
 }
 #endif
 
+extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]);
+extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]);
+static inline void p256_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]) {
+  if (use_s2n_bignum_alt()) { p256_montjscalarmul_alt(res, scalar, point); }
+  else { p256_montjscalarmul(res, scalar, point); }
+}
+
 // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced
 // Inputs x[6], y[6]; output z[6]
 extern void bignum_add_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
@@ -110,6 +117,13 @@ static inline void p384_montjdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 18],u
     else { p384_montjdouble(p3, p1); }
 }
 
+extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]);
+extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]);
+static inline void p384_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]) {
+  if (use_s2n_bignum_alt()) { p384_montjscalarmul_alt(res, scalar, point); }
+  else { p384_montjscalarmul(res, scalar, point); }
+}
+
 // Convert 6-digit (384-bit) bignum from little-endian form
 // Input x[6]; output z[6]
 extern void bignum_fromlebytes_6(uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
@@ -158,12 +172,18 @@ extern void bignum_fromlebytes_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint8
 // Convert 9-digit 528-bit bignum to little-endian bytes
 extern void bignum_tolebytes_p521(uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]);
 
-extern void p521_jdouble(uint64_t p3[static 27],uint64_t p1[static 27]);
-extern void p521_jdouble_alt(uint64_t p3[static 27],uint64_t p1[static 27]);
+extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]);
+extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]);
 static inline void p521_jdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]) {
     if (use_s2n_bignum_alt()) { p521_jdouble_alt(p3, p1); }
     else { p521_jdouble(p3, p1); }
 }
+extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]);
+extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]);
+static inline void p521_jscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]) {
+    if (use_s2n_bignum_alt()) { p521_jscalarmul_alt(res, scalar, point); }
+    else { p521_jscalarmul(res, scalar, point); }
+}
 
 // curve25519_x25519_byte and curve25519_x25519_byte_alt computes the x25519
 // function specified in https://www.rfc-editor.org/rfc/rfc7748. |scalar| is the
diff --git a/util/fipstools/delocate/delocate.peg b/util/fipstools/delocate/delocate.peg
@@ -114,15 +114,19 @@ ARMPostincrement <- '!'
 BaseIndexScale <- '(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)? )? ')'
 Operator <- [+\-]
 OffsetOperator <- '+' / '-' / '*'
+# s2n-bignum code has a lot of different and complex ways to compute an offset.
+# For example, (7*72)+(3*72)*(5-1)+8+0*72. We define S2nBignumHelper in an attempt
+# to simplify the expressions for Offset.
+S2nBignumHelper <- '(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' WS? OffsetOperator? WS?
 Offset <- '+'? '-'? (("0b" [01]+) /
                      ("0x" [[0-9A-F]]+) /
                      ([0-9]+ WS OffsetOperator [0-9]+ /
                       [0-9]+ ( OffsetOperator '(' [0-9]+ OffsetOperator [0-9]+ ')' )? /
                       [0-9]+ ( OffsetOperator [0-9]+ OffsetOperator [0-9]+ )? /
                       [0-9]+ ( OffsetOperator [0-9]+ )? /
-                      '(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' OffsetOperator [0-9]+ OffsetOperator [0-9]+ /
-                      '(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' OffsetOperator [0-9]+ !'x' /
-                      '(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' /
+                      S2nBignumHelper S2nBignumHelper (S2nBignumHelper ([0-9]+ OffsetOperator)? [0-9]+ OffsetOperator)? [0-9]+ /
+                      S2nBignumHelper [0-9]+ ((WS? OffsetOperator [0-9]+ (WS? OffsetOperator [0-9]+)?) / (!'x')) /
+                      S2nBignumHelper /
                       '(' [0-9]+ WS? OffsetOperator WS? [0-9]+ WS? OffsetOperator WS? [0-9]+')')![[A-Z]]
                     )
 Section <- [[A-Z@]]+
diff --git a/util/fipstools/delocate/delocate.peg.go b/util/fipstools/delocate/delocate.peg.go