@@ -380,10 +380,9 @@ static const auto kMaskInterleave3 = _mm_set1_epi32(0x33333333);
380380static const auto kMaskInterleave4 = _mm_set1_epi32(0x55555555 );
381381
382382#if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX512
383- static const auto kShuffleMinX512 = _mm512_setr_epi64(0 , 4 , 8 , 12 , 0 , 0 , 0 , 0 );
384- static const auto kShuffleMinY512 = _mm512_setr_epi64(1 , 5 , 9 , 13 , 0 , 0 , 0 , 0 );
385- static const auto kShuffleMaxX512 = _mm512_setr_epi64(2 , 6 , 10 , 14 , 0 , 0 , 0 , 0 );
386- static const auto kShuffleMaxY512 = _mm512_setr_epi64(3 , 7 , 11 , 15 , 0 , 0 , 0 , 0 );
383+ static const auto kPermuteMinXY512 = _mm512_setr_epi64(0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 );
384+ static const auto kPermuteMaxXY512 = _mm512_setr_epi64(2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 );
385+ static const auto kPermuteXLoYHi = _mm256_setr_epi32(0 , 2 , 4 , 6 , 1 , 3 , 5 , 7 );
387386#endif
388387
389388inline __m128i Interleave (__m128i v) {
@@ -1009,10 +1008,17 @@ std::vector<uint32_t> computeHilbertValues(size_t iNumItems,
10091008 auto wIdx = 0UL ;
10101009
10111010#if defined(FLATBUSH_USE_SIMD)
1011+ #if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX
1012+ const auto wHilbertWidth128 = _mm_broadcast_ss (&wHilbertWidth);
1013+ const auto wHilbertHeight128 = _mm_broadcast_ss (&wHilbertHeight);
1014+ const auto wDoubleMinX128 = _mm_broadcast_ss (&wDoubleMinX);
1015+ const auto wDoubleMinY128 = _mm_broadcast_ss (&wDoubleMinY);
1016+ #else
10121017 const auto wHilbertWidth128 = _mm_set1_ps (wHilbertWidth);
10131018 const auto wHilbertHeight128 = _mm_set1_ps (wHilbertHeight);
10141019 const auto wDoubleMinX128 = _mm_set1_ps (wDoubleMinX);
10151020 const auto wDoubleMinY128 = _mm_set1_ps (wDoubleMinY);
1021+ #endif
10161022
10171023 for (; wIdx + 3 < iNumItems; wIdx += 4 ) {
10181024 __m128 wMinX;
@@ -1041,9 +1047,9 @@ std::vector<uint32_t> computeHilbertValues(size_t iNumItems,
10411047}
10421048
10431049template <>
1044- inline std::vector<uint32_t > computeHilbertValues<double >(size_t iNumItems,
1045- const Box<double >& iBounds,
1046- span<Box<double >> iBoxes) {
1050+ std::vector<uint32_t > computeHilbertValues<double >(size_t iNumItems,
1051+ const Box<double >& iBounds,
1052+ span<Box<double >> iBoxes) {
10471053 static constexpr auto kMaxHilbertRatio = 0.5 * std::numeric_limits<uint16_t >::max ();
10481054 const auto wHilbertWidth = kMaxHilbertRatio / (iBounds.mMaxX - iBounds.mMinX );
10491055 const auto wHilbertHeight = kMaxHilbertRatio / (iBounds.mMaxY - iBounds.mMinY );
@@ -1053,25 +1059,36 @@ inline std::vector<uint32_t> computeHilbertValues<double>(size_t iNumItems,
10531059 auto wIdx = 0UL ;
10541060
10551061#if defined(FLATBUSH_USE_SIMD)
1056- #if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX
1057- const auto wHilbertWidth256 = _mm256_set1_pd (wHilbertWidth);
1058- const auto wHilbertHeight256 = _mm256_set1_pd (wHilbertHeight);
1059- const auto wDoubleMinX256 = _mm256_set1_pd (wDoubleMinX);
1060- const auto wDoubleMinY256 = _mm256_set1_pd (wDoubleMinY);
1062+ #if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX512
1063+ const auto wHilbertWidth512 = _mm512_set1_pd (wHilbertWidth);
1064+ const auto wHilbertHeight512 = _mm512_set1_pd (wHilbertHeight);
1065+ const auto wDoubleMinX512 = _mm512_set1_pd (wDoubleMinX);
1066+ const auto wDoubleMinY512 = _mm512_set1_pd (wDoubleMinY);
1067+ const auto wWidthHeight512 = _mm512_mask_blend_pd (0xAA , wHilbertWidth512, wHilbertHeight512);
1068+ const auto wDoubleMinXY512 = _mm512_mask_blend_pd (0xAA , wDoubleMinX512, wDoubleMinY512);
10611069
10621070 for (; wIdx + 3 < iNumItems; wIdx += 4 ) {
1063- #if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX512
10641071 const auto wBoxes01 = _mm512_loadu_pd (&iBoxes[wIdx].mMinX );
10651072 const auto wBoxes23 = _mm512_loadu_pd (&iBoxes[wIdx + 2 ].mMinX );
1066- const auto wMinX =
1067- _mm512_castpd512_pd256 (_mm512_permutex2var_pd (wBoxes01, kShuffleMinX512 , wBoxes23));
1068- const auto wMinY =
1069- _mm512_castpd512_pd256 (_mm512_permutex2var_pd (wBoxes01, kShuffleMinY512 , wBoxes23));
1070- const auto wMaxX =
1071- _mm512_castpd512_pd256 (_mm512_permutex2var_pd (wBoxes01, kShuffleMaxX512 , wBoxes23));
1072- const auto wMaxY =
1073- _mm512_castpd512_pd256 (_mm512_permutex2var_pd (wBoxes01, kShuffleMaxY512 , wBoxes23));
1074- #else
1073+ const auto wMin = _mm512_permutex2var_pd (wBoxes01, kPermuteMinXY512 , wBoxes23);
1074+ const auto wMax = _mm512_permutex2var_pd (wBoxes01, kPermuteMaxXY512 , wBoxes23);
1075+ const auto wResult = _mm256_permutevar8x32_epi32 (
1076+ _mm512_cvtpd_epi32 (_mm512_mul_pd (
1077+ wWidthHeight512, _mm512_sub_pd (_mm512_add_pd (wMin, wMax), wDoubleMinXY512))),
1078+ kPermuteXLoYHi );
1079+ const auto wResultX = _mm256_castsi256_si128 (wResult);
1080+ const auto wResultY = _mm256_extracti32x4_epi32 (wResult, 1 );
1081+
1082+ _mm_storeu_si128 (bit_cast<__m128i*>(&wHilbertValues[wIdx]),
1083+ HilbertXYToIndex (wResultX, wResultY));
1084+ }
1085+ #elif FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX
1086+ const auto wHilbertWidth256 = _mm256_broadcast_sd (&wHilbertWidth);
1087+ const auto wHilbertHeight256 = _mm256_broadcast_sd (&wHilbertHeight);
1088+ const auto wDoubleMinX256 = _mm256_broadcast_sd (&wDoubleMinX);
1089+ const auto wDoubleMinY256 = _mm256_broadcast_sd (&wDoubleMinY);
1090+
1091+ for (; wIdx + 3 < iNumItems; wIdx += 4 ) {
10751092 const auto wBox0 = _mm256_loadu_pd (&iBoxes[wIdx].mMinX );
10761093 const auto wBox1 = _mm256_loadu_pd (&iBoxes[wIdx + 1 ].mMinX );
10771094 const auto wBox2 = _mm256_loadu_pd (&iBoxes[wIdx + 2 ].mMinX );
@@ -1084,7 +1101,6 @@ inline std::vector<uint32_t> computeHilbertValues<double>(size_t iNumItems,
10841101 const auto wMinY = _mm256_permute2f128_pd (wBoxes01Hi, wBoxes23Hi, 0x20 );
10851102 const auto wMaxX = _mm256_permute2f128_pd (wBoxes01Lo, wBoxes23Lo, 0x31 );
10861103 const auto wMaxY = _mm256_permute2f128_pd (wBoxes01Hi, wBoxes23Hi, 0x31 );
1087- #endif
10881104 const auto wSumX = _mm256_add_pd (wMinX, wMaxX);
10891105 const auto wSumY = _mm256_add_pd (wMinY, wMaxY);
10901106 const auto wResultX = _mm256_mul_pd (wHilbertWidth256, _mm256_sub_pd (wSumX, wDoubleMinX256));
@@ -1462,7 +1478,7 @@ void Flatbush<ArrayType>::create(std::vector<Box<ArrayType>>&& iItems) noexcept
14621478 auto wNodeBox = mBoxes [wPosition];
14631479
14641480 // calculate bbox for the new node
1465- for (size_t wCount = 0 ; wCount < wNodeSize && wPosition < wEnd; ++wCount, ++wPosition) {
1481+ for (size_t wCount = 0UL ; wCount < wNodeSize && wPosition < wEnd; ++wCount, ++wPosition) {
14661482 detail::updateBounds (wNodeBox, mBoxes [wPosition]);
14671483 }
14681484
@@ -1611,6 +1627,12 @@ std::vector<size_t> Flatbush<ArrayType>::search(const Box<ArrayType>& iBounds,
16111627
16121628 wNodeIndex = wQueue.back () >> 2U ; // for binary compatibility with JS
16131629 wQueue.pop_back ();
1630+
1631+ #ifdef __GNUC__
1632+ __builtin_prefetch (&mBoxes [wNodeIndex], 0 , 3 );
1633+ #elif defined(_MSC_VER)
1634+ _mm_prefetch (detail::bit_cast<const char *>(&mBoxes [wNodeIndex]), _MM_HINT_T0);
1635+ #endif
16141636 }
16151637
16161638 return wResults;
@@ -1666,6 +1688,12 @@ std::vector<size_t> Flatbush<ArrayType>::neighbors(const Point<ArrayType>& iPoin
16661688
16671689 wNodeIndex = wQueue.top ().mId >> 3U ; // 1 to undo indexing + 2 for binary compatibility with JS
16681690 wQueue.pop ();
1691+
1692+ #ifdef __GNUC__
1693+ __builtin_prefetch (&mBoxes [wNodeIndex], 0 , 3 );
1694+ #elif defined(_MSC_VER)
1695+ _mm_prefetch (detail::bit_cast<const char *>(&mBoxes [wNodeIndex]), _MM_HINT_T0);
1696+ #endif
16691697 }
16701698
16711699 return wResults;
0 commit comments