Skip to content

Commit 1bbe409

Browse files
committed
Minor spacing fixes and more comments on AddBigit and MulAddBigit.
1 parent 4fc8806 commit 1bbe409

File tree

2 files changed

+30
-2
lines changed

2 files changed

+30
-2
lines changed

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,6 @@ if (GOOGLETEST_ROOT)
228228
src/s2/thread_testing.cc)
229229
endif()
230230

231-
232231
target_link_libraries(
233232
s2
234233
absl::absl_vlog_is_on

src/s2/util/math/exactfloat/bignum.cc

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,22 @@ Bignum Bignum::Pow(int32_t pow) const {
278278

279279
// Computes a + b + carry and updates the carry.
280280
inline Bigit AddBigit(Bigit a, Bigit b, Bigit* absl_nonnull carry) {
281+
// Compilers such as GCC and Clang are known to be terrible at generating good
282+
// code for long carry chains on Intel. Using the _addcarry_u64 intrinsic (which
283+
// maps to the add/adc instructions produces a tight series of add/adc/adc/adc
284+
// instructions, whereas writing the loop manually often generates many add/adc
285+
// pairs with spurious bit twiddling.
286+
//
287+
// Using the intrinsic here improves benchmarks by ~30% when summing larger
288+
// Bignums together.
289+
//
290+
// See this SO discussion for more information:
291+
// https://stackoverflow.com/questions/33690791
292+
//
293+
// Godbolt link for comparison:
294+
// https://godbolt.org/z/cGnnfMbMn
281295
#ifdef __x86_64__
296+
static_assert(sizeof(Bigit) == sizeof(unsigned long long));
282297
Bigit out;
283298
*carry =
284299
_addcarry_u64(*carry, a, b, reinterpret_cast<unsigned long long*>(&out));
@@ -295,6 +310,7 @@ inline Bigit AddBigit(Bigit a, Bigit b, Bigit* absl_nonnull carry) {
295310
// NOTE: Borrow must be one or zero.
296311
inline Bigit SubBigit(Bigit a, Bigit b, Bigit* absl_nonnull borrow) {
297312
ABSL_DCHECK_LE(*borrow, Bigit(1));
313+
// See notes in AddBigit on why using an intrinsic is the right choice here.
298314
#ifdef __x86_64__
299315
Bigit out;
300316
*borrow = _subborrow_u64(*borrow, a, b,
@@ -319,6 +335,18 @@ inline Bigit MulBigit(Bigit a, Bigit b, Bigit* absl_nonnull carry) {
319335
// NOTE: Will not overflow even if a, b, and c are their maximum values.
320336
inline void MulAddBigit(Bigit* absl_nonnull sum, Bigit a, Bigit b,
321337
Bigit* absl_nonnull carry) {
338+
// Similar to the comment in AddBigit, and just for completeness, it's worth
339+
// noting that the "best" way to implement this is with the Intel MULX, ADCQ,
340+
// and ADOQ instructions (i.e. the _mulx_u64, _addcarry_u64, and
341+
// _addcarryx_u64 intrinsics), but GCC and Clang do not support _addcarryx_u64
342+
// properly (and have no plans to do so). The issue is that gcc doesn't
343+
// support reasoning about separate dependency chains for the carry and
344+
// overflow flags, because all the flags are considered to be one
345+
// register. (The ADOX instructions were added specifically for this use case,
346+
// i.e. high-precision integer multiplies. They propagate carries using the
347+
// overflow flag rather than the carry flag, which lets you do two
348+
// extended-precision add operations in parallel without having them stomp on
349+
// each other's carry flags. )
322350
auto term = absl::uint128(a) * b + *carry + *sum;
323351
*carry = absl::Uint128High64(term);
324352
*sum = static_cast<Bigit>(term);
@@ -338,7 +366,8 @@ inline void MulAddBigit(Bigit* absl_nonnull sum, Bigit a, Bigit b,
338366
//
339367
// Rather than having to expand A to B.bigits_.size() + 1, and popping off the
340368
// top bigit if it's unused (which is the most common case).
341-
inline Bigit AddInPlace(absl::Span<Bigit> a, absl::Span<const Bigit> b) {
369+
ABSL_ATTRIBUTE_NOINLINE inline Bigit AddInPlace(absl::Span<Bigit> a,
370+
absl::Span<const Bigit> b) {
342371
ABSL_DCHECK_GE(a.size(), b.size());
343372

344373
Bigit carry = 0;

0 commit comments

Comments
 (0)