@@ -263,6 +263,20 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
263263 intersection = _mm512_reduce_add_epi64 (_mm512_add_epi64 (and2_count_vec , and1_count_vec ));
264264 union_ = _mm512_reduce_add_epi64 (_mm512_add_epi64 (or2_count_vec , or1_count_vec ));
265265 } else if (n_words <= 196 ) { // Up to 1568 bits.
266+ // TODO: On such vectors we can clearly see that the CPU struggles to perform this many parallel
267+ // population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
268+ // One optimization, aside from Harley-Seal transforms can be using "shuffles" for nibble-popcount
269+ // lookups, to utilize other ports on the CPU.
270+ // https://github.com/ashvardanian/SimSIMD/pull/138
271+ //
272+ // On Ice Lake:
273+ // - `VPOPCNTQ (ZMM, K, ZMM)` can only execute on port 5, which is a bottleneck.
274+ // - `VPSHUFB (ZMM, ZMM, ZMM)` can only run on the same port 5 as well!
275+ // On Zen4:
276+ // - `VPOPCNTQ (ZMM, K, ZMM)` can run on ports: 0, 1.
277+ // - `VPSHUFB (ZMM, ZMM, ZMM)` can run on ports: 1, 2.
278+ // https://uops.info/table.html?search=VPOPCNTQ%20(ZMM%2C%20K%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
279+ // https://uops.info/table.html?search=VPSHUFB%20(ZMM%2C%20ZMM%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
266280 __mmask64 mask = (__mmask64 )_bzhi_u64 (0xFFFFFFFFFFFFFFFF , n_words - 128 );
267281 __m512i a1_vec = _mm512_loadu_epi8 (a );
268282 __m512i b1_vec = _mm512_loadu_epi8 (b );
@@ -276,10 +290,10 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
276290 __m512i or2_count_vec = _mm512_popcnt_epi64 (_mm512_or_si512 (a2_vec , b2_vec ));
277291 __m512i and3_count_vec = _mm512_popcnt_epi64 (_mm512_and_si512 (a3_vec , b3_vec ));
278292 __m512i or3_count_vec = _mm512_popcnt_epi64 (_mm512_or_si512 (a3_vec , b3_vec ));
279- intersection =
280- _mm512_reduce_add_epi64 ( _mm512_add_epi64 (and3_count_vec , _mm512_add_epi64 (and2_count_vec , and1_count_vec )));
281- union_ =
282- _mm512_reduce_add_epi64 ( _mm512_add_epi64 (or3_count_vec , _mm512_add_epi64 (or2_count_vec , or1_count_vec )));
293+ intersection = _mm512_reduce_add_epi64 ( //
294+ _mm512_add_epi64 (and3_count_vec , _mm512_add_epi64 (and2_count_vec , and1_count_vec )));
295+ union_ = _mm512_reduce_add_epi64 ( //
296+ _mm512_add_epi64 (or3_count_vec , _mm512_add_epi64 (or2_count_vec , or1_count_vec )));
283297 } else if (n_words <= 256 ) { // Up to 2048 bits.
284298 __mmask64 mask = (__mmask64 )_bzhi_u64 (0xFFFFFFFFFFFFFFFF , n_words - 192 );
285299 __m512i a1_vec = _mm512_loadu_epi8 (a );
0 commit comments