Skip to content

Commit 02ea4c4

Browse files
authored
[EC] P-256/384/521 s2n-bignum scalar multiplication (#2036)
For curves P-256/384/521 we use the s2n-bignum implementation of scalar multiplication of an arbitrary point. This gives the following performance improvements (measurements in ops/s):
```
__Apple M1__| before | after | speedup |
P-256 MUL   |  27871 | 31607 |   1.13x |
P-256 ECDH  |  20804 | 22778 |   1.11x |
P-384 MUL   |   7245 |  8618 |   1.19x |
P-384 ECDH  |   5367 |  5986 |   1.11x |
P-521 MUL   |   5040 |  5806 |   1.15x |
P-521 ECDH  |   3696 |  4053 |   1.10x |

____Intel___| before | after | speedup |
P-256 MUL   |  21913 | 25650 |   1.17x |
P-256 ECDH  |  17188 | 19453 |   1.13x |
P-384 MUL   |   6554 |  7691 |   1.17x |
P-384 ECDH  |   4731 |  5321 |   1.12x |
P-521 MUL   |   4400 |  5151 |   1.17x |
P-521 ECDH  |   3192 |  3514 |   1.10x |
```
where Apple M1 is an M1-based MacBook laptop, and Intel is an Intel(R) Xeon(R) Platinum 8488C.
1 parent 850af98 commit 02ea4c4

File tree

7 files changed

+621
-600
lines changed

7 files changed

+621
-600
lines changed

crypto/fipsmodule/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,9 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
205205
set(
206206
S2N_BIGNUM_ASM_SOURCES
207207

208+
p256/p256_montjscalarmul.S
209+
p256/p256_montjscalarmul_alt.S
210+
208211
p384/bignum_add_p384.S
209212
p384/bignum_sub_p384.S
210213
p384/bignum_neg_p384.S
@@ -218,6 +221,8 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
218221
p384/bignum_littleendian_6.S
219222
p384/p384_montjdouble.S
220223
p384/p384_montjdouble_alt.S
224+
p384/p384_montjscalarmul.S
225+
p384/p384_montjscalarmul_alt.S
221226

222227
p521/bignum_add_p521.S
223228
p521/bignum_sub_p521.S
@@ -230,6 +235,8 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
230235
p521/bignum_fromlebytes_p521.S
231236
p521/p521_jdouble.S
232237
p521/p521_jdouble_alt.S
238+
p521/p521_jscalarmul.S
239+
p521/p521_jscalarmul_alt.S
233240

234241
curve25519/bignum_mod_n25519.S
235242
curve25519/bignum_neg_p25519.S

crypto/fipsmodule/ec/p256-nistz.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@
3131
#include "../../internal.h"
3232
#include "internal.h"
3333
#include "p256-nistz.h"
34+
#include "ec_nistp.h"
35+
36+
#if defined(EC_NISTP_USE_S2N_BIGNUM)
37+
#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h"
38+
#endif
3439

3540
#if !defined(OPENSSL_NO_ASM) && \
3641
(defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
@@ -304,6 +309,13 @@ static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) {
304309
static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
305310
const EC_JACOBIAN *p,
306311
const EC_SCALAR *scalar) {
312+
#if defined(EC_NISTP_USE_S2N_BIGNUM)
313+
ec_nistp_felem_limb in[P256_LIMBS * 3];
314+
ec_nistp_felem_limb out[P256_LIMBS * 3];
315+
ec_nistp_coordinates_to_point(in, p->X.words, p->Y.words, p->Z.words, P256_LIMBS);
316+
p256_montjscalarmul_selector(out, scalar->words, in);
317+
ec_nistp_point_to_coordinates(r->X.words, r->Y.words, r->Z.words, out, P256_LIMBS);
318+
#else
307319
stack_align_type buffer_out[32 + sizeof(P256_POINT)];
308320
P256_POINT *aligned_out = (P256_POINT *) align_pointer(buffer_out, 32);
309321
ecp_nistz256_windowed_mul(group, aligned_out, p, scalar);
@@ -312,6 +324,7 @@ static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
312324
OPENSSL_memcpy(r->X.words, aligned_out->X, P256_LIMBS * sizeof(BN_ULONG));
313325
OPENSSL_memcpy(r->Y.words, aligned_out->Y, P256_LIMBS * sizeof(BN_ULONG));
314326
OPENSSL_memcpy(r->Z.words, aligned_out->Z, P256_LIMBS * sizeof(BN_ULONG));
327+
#endif
315328
}
316329

317330
static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_JACOBIAN *r,

crypto/fipsmodule/ec/p384.c

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -438,39 +438,6 @@ static int ec_GFp_nistp384_cmp_x_coordinate(const EC_GROUP *group,
438438
return 0;
439439
}
440440

441-
// ----------------------------------------------------------------------------
442-
// SCALAR MULTIPLICATION OPERATIONS
443-
// ----------------------------------------------------------------------------
444-
//
445-
// The method for computing scalar products in functions:
446-
// - |ec_GFp_nistp384_point_mul|,
447-
// - |ec_GFp_nistp384_point_mul_base|,
448-
// - |ec_GFp_nistp384_point_mul_public|,
449-
// is adapted from ECCKiila project (https://arxiv.org/abs/2007.11481).
450-
//
451-
// One difference from the processing in the ECCKiila project is the order of
452-
// the digit processing in |ec_GFp_nistp384_point_mul_base|, where we end the
453-
// processing with the least significant digit to be able to apply the
454-
// analysis results detailed at the bottom of this file. In
455-
// |ec_GFp_nistp384_point_mul_base| and |ec_GFp_nistp384_point_mul|, we
456-
// considered using window size 7 based on that same analysis. However, the
457-
// table size and performance measurements were more preferable for window
458-
// size 5. The potential issue with different window sizes is that for some
459-
// sizes, a scalar can be found such that a case of point doubling instead of
460-
// point addition happens in the scalar multiplication. This would make
461-
// the multiplication non constant-time. To the best of our knowledge this
462-
// timing leak is not an exploitable issue because the only scalar for which
463-
// the leak can happen is already known by the attacker. This is also provided
464-
// that this recoding and window size are only used with ECDH and ECDSA
465-
// protocols. Any other use would need to be analyzed to determine whether it is
466-
// secure and the user should be aware of this side channel of a particular
467-
// scalar value.
468-
//
469-
// OpenSSL has a similar analysis for P-521 implementation:
470-
// https://github.com/openssl/openssl/blob/e9492d1cecf459261f1f5ac0eb03e9c631600537/crypto/ec/ecp_nistp521.c#L1318
471-
//
472-
// For detailed analysis of different window sizes see the bottom of this file.
473-
474441
// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
475442
static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
476443
const EC_JACOBIAN *p,
@@ -482,7 +449,11 @@ static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
482449
p384_from_generic(tmp[1], &p->Y);
483450
p384_from_generic(tmp[2], &p->Z);
484451

452+
#if defined(EC_NISTP_USE_S2N_BIGNUM)
453+
p384_montjscalarmul_selector((uint64_t*)res, scalar->words, (uint64_t*)tmp);
454+
#else
485455
ec_nistp_scalar_mul(p384_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
456+
#endif
486457

487458
p384_to_generic(&r->X, res[0]);
488459
p384_to_generic(&r->Y, res[1]);

crypto/fipsmodule/ec/p521.c

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -377,37 +377,6 @@ static void ec_GFp_nistp521_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
377377
p521_to_generic(&r->Z, z);
378378
}
379379

380-
// ----------------------------------------------------------------------------
381-
// SCALAR MULTIPLICATION OPERATIONS
382-
// ----------------------------------------------------------------------------
383-
//
384-
// The method for computing scalar products in functions:
385-
// - |ec_GFp_nistp521_point_mul|,
386-
// - |ec_GFp_nistp521_point_mul_base|,
387-
// - |ec_GFp_nistp521_point_mul_public|,
388-
// is adapted from ECCKiila project (https://arxiv.org/abs/2007.11481).
389-
// The main difference is that we use a window of size 7 instead of 5 for the
390-
// first two functions. The potential issue with window sizes is that for some
391-
// sizes a scalar can be found such that a case of point doubling instead of
392-
// point addition happens in the scalar multiplication. This would make the
393-
// multiplication non constant-time. Therefore, such window sizes have to be
394-
// avoided. The windows size of 7 is chosen based on analysis analogous to
395-
// the one in |ec_GFp_nistp_recode_scalar_bits| function in |util.c| file.
396-
// See the analysis at the bottom of this file.
397-
//
398-
// Moreover, the order in which the digits of the scalar are processed in
399-
// |ec_GFp_nistp521_point_mul_base| is different from the ECCKiila project, to
400-
// ensure that the least significant digit is processed last which together
401-
// with the window size 7 guarantees constant-time execution of the function.
402-
//
403-
// Another difference is that in |ec_GFp_nistp521_point_mul_public| function we
404-
// use window size 5 for the public point and 7 for the base point. Here it is
405-
// ok to use window of size 5 since the scalar is public and therefore the
406-
// function doesn't have to be constant-time.
407-
//
408-
// The precomputed table of base point multiples is generated by the code in
409-
// |make_tables.go| script.
410-
411380
// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
412381
static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
413382
const EC_JACOBIAN *p,
@@ -419,7 +388,11 @@ static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
419388
p521_from_generic(tmp[1], &p->Y);
420389
p521_from_generic(tmp[2], &p->Z);
421390

391+
#if defined(EC_NISTP_USE_S2N_BIGNUM)
392+
p521_jscalarmul_selector((uint64_t*)res, scalar->words, (uint64_t*)tmp);
393+
#else
422394
ec_nistp_scalar_mul(p521_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
395+
#endif
423396

424397
p521_to_generic(&r->X, res[0]);
425398
p521_to_generic(&r->Y, res[1]);

third_party/s2n-bignum/include/s2n-bignum_aws-lc.h

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,13 @@ static inline uint8_t use_s2n_bignum_alt(void) {
5656
}
5757
#endif
5858

59+
extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]);
60+
extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]);
61+
static inline void p256_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]) {
62+
if (use_s2n_bignum_alt()) { p256_montjscalarmul_alt(res, scalar, point); }
63+
else { p256_montjscalarmul(res, scalar, point); }
64+
}
65+
5966
// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced
6067
// Inputs x[6], y[6]; output z[6]
6168
extern void bignum_add_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
@@ -110,6 +117,13 @@ static inline void p384_montjdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 18],u
110117
else { p384_montjdouble(p3, p1); }
111118
}
112119

120+
extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]);
121+
extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]);
122+
static inline void p384_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]) {
123+
if (use_s2n_bignum_alt()) { p384_montjscalarmul_alt(res, scalar, point); }
124+
else { p384_montjscalarmul(res, scalar, point); }
125+
}
126+
113127
// Convert 6-digit (384-bit) bignum from little-endian form
114128
// Input x[6]; output z[6]
115129
extern void bignum_fromlebytes_6(uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
@@ -158,12 +172,18 @@ extern void bignum_fromlebytes_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint8
158172
// Convert 9-digit 528-bit bignum to little-endian bytes
159173
extern void bignum_tolebytes_p521(uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]);
160174

161-
extern void p521_jdouble(uint64_t p3[static 27],uint64_t p1[static 27]);
162-
extern void p521_jdouble_alt(uint64_t p3[static 27],uint64_t p1[static 27]);
175+
extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]);
176+
extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]);
163177
static inline void p521_jdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]) {
164178
if (use_s2n_bignum_alt()) { p521_jdouble_alt(p3, p1); }
165179
else { p521_jdouble(p3, p1); }
166180
}
181+
extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]);
182+
extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]);
183+
static inline void p521_jscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]) {
184+
if (use_s2n_bignum_alt()) { p521_jscalarmul_alt(res, scalar, point); }
185+
else { p521_jscalarmul(res, scalar, point); }
186+
}
167187

168188
// curve25519_x25519_byte and curve25519_x25519_byte_alt computes the x25519
169189
// function specified in https://www.rfc-editor.org/rfc/rfc7748. |scalar| is the

util/fipstools/delocate/delocate.peg

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,15 +114,19 @@ ARMPostincrement <- '!'
114114
BaseIndexScale <- '(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)? )? ')'
115115
Operator <- [+\-]
116116
OffsetOperator <- '+' / '-' / '*'
117+
# s2n-bignum code has a lot of different and complex ways to compute an offset.
118+
# For example, (7*72)+(3*72)*(5-1)+8+0*72. We define S2nBignumHelper in an attempt
119+
# to simplify the expressions for Offset.
120+
S2nBignumHelper <- '(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' WS? OffsetOperator? WS?
117121
Offset <- '+'? '-'? (("0b" [01]+) /
118122
("0x" [[0-9A-F]]+) /
119123
([0-9]+ WS OffsetOperator [0-9]+ /
120124
[0-9]+ ( OffsetOperator '(' [0-9]+ OffsetOperator [0-9]+ ')' )? /
121125
[0-9]+ ( OffsetOperator [0-9]+ OffsetOperator [0-9]+ )? /
122126
[0-9]+ ( OffsetOperator [0-9]+ )? /
123-
'(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' OffsetOperator [0-9]+ OffsetOperator [0-9]+ /
124-
'(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' OffsetOperator [0-9]+ !'x' /
125-
'(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' /
127+
S2nBignumHelper S2nBignumHelper (S2nBignumHelper ([0-9]+ OffsetOperator)? [0-9]+ OffsetOperator)? [0-9]+ /
128+
S2nBignumHelper [0-9]+ ((WS? OffsetOperator [0-9]+ (WS? OffsetOperator [0-9]+)?) / (!'x')) /
129+
S2nBignumHelper /
126130
'(' [0-9]+ WS? OffsetOperator WS? [0-9]+ WS? OffsetOperator WS? [0-9]+')')![[A-Z]]
127131
)
128132
Section <- [[A-Z@]]+

0 commit comments

Comments
 (0)