Skip to content

Commit b8ee93f

Browse files
authored
Improve: Sparse intersection dependency chain (#251)
The sequence for calculating the start of the next iteration has high latency, so moving it to earlier in the loop improves IPC. Results from Zen 4 suggest ~10% better IPC and throughput across the board. The kernel had a long dependency chain, and `vpbroadcastd` & `vpcmpleud` & `kmovw` have pretty high latencies, especially on Zen 4 (Icelake is a few cycles shorter). With the old code, even passing in `-march=znver4 -mtune=znver4` isn't enough for the compilers to fully move this sequence before the intersection subroutine.
1 parent e682bbd commit b8ee93f

File tree

1 file changed

+14
-14
lines changed

1 file changed

+14
-14
lines changed

include/simsimd/sparse.h

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -435,20 +435,20 @@ SIMSIMD_PUBLIC void simsimd_intersect_u16_ice( //
435435
}
436436
b_min = b_vec.u16[0];
437437

438+
__m512i a_last_broadcasted = _mm512_set1_epi16(*(short const *)&a_max);
439+
__m512i b_last_broadcasted = _mm512_set1_epi16(*(short const *)&b_max);
440+
__mmask32 a_step_mask = _mm512_cmple_epu16_mask(a_vec.zmm, b_last_broadcasted);
441+
__mmask32 b_step_mask = _mm512_cmple_epu16_mask(b_vec.zmm, a_last_broadcasted);
442+
a += 32 - _lzcnt_u32((simsimd_u32_t)a_step_mask);
443+
b += 32 - _lzcnt_u32((simsimd_u32_t)b_step_mask);
444+
438445
// Now we are likely to have some overlap, so we can intersect the registers
439446
__mmask32 a_matches = _simsimd_intersect_u16x32_ice(a_vec.zmm, b_vec.zmm);
440447

441448
// The paper also contained a very nice procedure for exporting the matches,
442449
// but we don't need it here:
443450
// _mm512_mask_compressstoreu_epi16(c, a_matches, a_vec);
444451
c += _mm_popcnt_u32(a_matches); // MSVC has no `_popcnt32`
445-
446-
__m512i a_last_broadcasted = _mm512_set1_epi16(*(short const *)&a_max);
447-
__m512i b_last_broadcasted = _mm512_set1_epi16(*(short const *)&b_max);
448-
__mmask32 a_step_mask = _mm512_cmple_epu16_mask(a_vec.zmm, b_last_broadcasted);
449-
__mmask32 b_step_mask = _mm512_cmple_epu16_mask(b_vec.zmm, a_last_broadcasted);
450-
a += 32 - _lzcnt_u32((simsimd_u32_t)a_step_mask);
451-
b += 32 - _lzcnt_u32((simsimd_u32_t)b_step_mask);
452452
}
453453

454454
simsimd_intersect_u16_serial(a, b, a_end - a, b_end - b, results);
@@ -500,20 +500,20 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_ice( //
500500
}
501501
b_min = b_vec.u32[0];
502502

503+
__m512i a_last_broadcasted = _mm512_set1_epi32(*(int const *)&a_max);
504+
__m512i b_last_broadcasted = _mm512_set1_epi32(*(int const *)&b_max);
505+
__mmask16 a_step_mask = _mm512_cmple_epu32_mask(a_vec.zmm, b_last_broadcasted);
506+
__mmask16 b_step_mask = _mm512_cmple_epu32_mask(b_vec.zmm, a_last_broadcasted);
507+
a += 32 - _lzcnt_u32((simsimd_u32_t)a_step_mask);
508+
b += 32 - _lzcnt_u32((simsimd_u32_t)b_step_mask);
509+
503510
// Now we are likely to have some overlap, so we can intersect the registers
504511
__mmask16 a_matches = _simsimd_intersect_u32x16_ice(a_vec.zmm, b_vec.zmm);
505512

506513
// The paper also contained a very nice procedure for exporting the matches,
507514
// but we don't need it here:
508515
// _mm512_mask_compressstoreu_epi32(c, a_matches, a_vec);
509516
c += _mm_popcnt_u32(a_matches); // MSVC has no `_popcnt32`
510-
511-
__m512i a_last_broadcasted = _mm512_set1_epi32(*(int const *)&a_max);
512-
__m512i b_last_broadcasted = _mm512_set1_epi32(*(int const *)&b_max);
513-
__mmask16 a_step_mask = _mm512_cmple_epu32_mask(a_vec.zmm, b_last_broadcasted);
514-
__mmask16 b_step_mask = _mm512_cmple_epu32_mask(b_vec.zmm, a_last_broadcasted);
515-
a += 32 - _lzcnt_u32((simsimd_u32_t)a_step_mask);
516-
b += 32 - _lzcnt_u32((simsimd_u32_t)b_step_mask);
517517
}
518518

519519
simsimd_intersect_u32_serial(a, b, a_end - a, b_end - b, results);

0 commit comments

Comments
 (0)