@@ -278,7 +278,22 @@ Bignum Bignum::Pow(int32_t pow) const {
 
 // Computes a + b + carry and updates the carry.
 inline Bigit AddBigit(Bigit a, Bigit b, Bigit* absl_nonnull carry) {
+  // Compilers such as GCC and Clang are known to be terrible at generating good
+  // code for long carry chains on Intel. Using the _addcarry_u64 intrinsic (which
+  // maps to the add/adc instructions) produces a tight series of add/adc/adc/adc
+  // instructions, whereas writing the loop manually often generates many add/adc
+  // pairs with spurious bit twiddling.
+  //
+  // Using the intrinsic here improves benchmarks by ~30% when summing larger
+  // Bignums together.
+  //
+  // See this SO discussion for more information:
+  // https://stackoverflow.com/questions/33690791
+  //
+  // Godbolt link for comparison:
+  // https://godbolt.org/z/cGnnfMbMn
 #ifdef __x86_64__
+  static_assert(sizeof(Bigit) == sizeof(unsigned long long));
   Bigit out;
   *carry =
       _addcarry_u64(*carry, a, b, reinterpret_cast<unsigned long long*>(&out));
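To make the add/adc claim in that comment concrete, here is a minimal sketch (not part of the commit) of the kind of carry chain AddBigit is meant to compile into; Add3 and the fixed-size arrays are hypothetical, chosen only for illustration:

// Illustrative only: summing three bigits per operand through AddBigit.
// With the intrinsic, this should lower on x86-64 to roughly
//   add out0, b0 ; adc out1, b1 ; adc out2, b2
// with the carry kept in the flags register across the whole chain.
inline void Add3(const Bigit a[3], const Bigit b[3], Bigit out[3],
                 Bigit* final_carry) {
  Bigit carry = 0;
  out[0] = AddBigit(a[0], b[0], &carry);
  out[1] = AddBigit(a[1], b[1], &carry);
  out[2] = AddBigit(a[2], b[2], &carry);
  *final_carry = carry;
}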
@@ -295,6 +310,7 @@ inline Bigit AddBigit(Bigit a, Bigit b, Bigit* absl_nonnull carry) {
 // NOTE: Borrow must be one or zero.
 inline Bigit SubBigit(Bigit a, Bigit b, Bigit* absl_nonnull borrow) {
   ABSL_DCHECK_LE(*borrow, Bigit(1));
+  // See notes in AddBigit on why using an intrinsic is the right choice here.
 #ifdef __x86_64__
   Bigit out;
   *borrow = _subborrow_u64(*borrow, a, b,
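The non-x86_64 branch of SubBigit falls outside this hunk; as a hedged sketch, a portable fallback could widen through absl::uint128 so the borrow falls out of the high word (SubBigitPortable is a made-up name, and the real #else branch may differ):

// Sketch only: portable subtract-with-borrow, assuming Bigit is uint64_t.
inline Bigit SubBigitPortable(Bigit a, Bigit b, Bigit* borrow) {
  absl::uint128 term = absl::uint128(a) - b - *borrow;
  // On underflow the subtraction wraps mod 2^128, so the high word is nonzero.
  *borrow = absl::Uint128High64(term) ? 1 : 0;
  return static_cast<Bigit>(term);
}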
@@ -319,6 +335,18 @@ inline Bigit MulBigit(Bigit a, Bigit b, Bigit* absl_nonnull carry) {
 // NOTE: Will not overflow even if a, b, and c are their maximum values.
 inline void MulAddBigit(Bigit* absl_nonnull sum, Bigit a, Bigit b,
                         Bigit* absl_nonnull carry) {
+  // Similar to the comment in AddBigit, and just for completeness, it's worth
+  // noting that the "best" way to implement this is with the Intel MULX, ADCQ,
+  // and ADOX instructions (i.e. the _mulx_u64, _addcarry_u64, and
+  // _addcarryx_u64 intrinsics), but GCC and Clang do not support _addcarryx_u64
+  // properly (and have no plans to do so). The issue is that GCC doesn't
+  // support reasoning about separate dependency chains for the carry and
+  // overflow flags, because all the flags are considered to be one
+  // register. (The ADOX instruction was added specifically for this use case,
+  // i.e. high-precision integer multiplies. It propagates carries using the
+  // overflow flag rather than the carry flag, which lets you do two
+  // extended-precision add operations in parallel without having them stomp on
+  // each other's carry flags.)
   auto term = absl::uint128(a) * b + *carry + *sum;
   *carry = absl::Uint128High64(term);
   *sum = static_cast<Bigit>(term);
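A quick worked check of the no-overflow NOTE above (my arithmetic, not text from the commit): with every operand at its maximum M = 2^64 - 1, the largest possible term is M*M + M + M = (2^128 - 2^65 + 1) + (2^65 - 2) = 2^128 - 1, exactly the largest absl::uint128 value, so the widened expression cannot wrap. A hedged test-style sketch of that boundary case (the function name and plain assert are placeholders):

#include <cassert>
#include <limits>

// Sketch only: exercise MulAddBigit at the all-maximum boundary.
void MulAddBigitMaxCheck() {
  const Bigit kMax = std::numeric_limits<Bigit>::max();
  Bigit sum = kMax;
  Bigit carry = kMax;
  MulAddBigit(&sum, kMax, kMax, &carry);
  // kMax*kMax + kMax + kMax == 2^128 - 1, which splits into kMax/kMax halves.
  assert(sum == kMax);
  assert(carry == kMax);
}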
@@ -338,7 +366,8 @@ inline void MulAddBigit(Bigit* absl_nonnull sum, Bigit a, Bigit b,
 //
 // Rather than having to expand A to B.bigits_.size() + 1, and popping off the
 // top bigit if it's unused (which is the most common case).
-inline Bigit AddInPlace(absl::Span<Bigit> a, absl::Span<const Bigit> b) {
+ABSL_ATTRIBUTE_NOINLINE inline Bigit AddInPlace(absl::Span<Bigit> a,
+                                                absl::Span<const Bigit> b) {
   ABSL_DCHECK_GE(a.size(), b.size());
 
   Bigit carry = 0;
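The rest of AddInPlace is truncated in this hunk; as a non-authoritative sketch, an in-place carry loop built on AddBigit usually takes roughly this shape (AddInPlaceSketch is a placeholder name, and the committed loop may differ):

// Sketch only: add b into a in place and return the final carry.
inline Bigit AddInPlaceSketch(absl::Span<Bigit> a, absl::Span<const Bigit> b) {
  ABSL_DCHECK_GE(a.size(), b.size());
  Bigit carry = 0;
  // Add the overlapping bigits of b into a.
  for (size_t i = 0; i < b.size(); ++i) {
    a[i] = AddBigit(a[i], b[i], &carry);
  }
  // Propagate any remaining carry through the rest of a.
  for (size_t i = b.size(); i < a.size() && carry != 0; ++i) {
    a[i] = AddBigit(a[i], 0, &carry);
  }
  return carry;  // Non-zero only if the sum needs one bigit more than a holds.
}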