Skip to content

Commit 9946b99

Browse files
committed
Cleanup
1 parent abca606 commit 9946b99

File tree

2 files changed

+53
-25
lines changed

2 files changed

+53
-25
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ RUN apt update -y \
1010
curl \
1111
git \
1212
tar \
13-
unzip
13+
unzip \
1414
zip \
1515
&& apt clean all \
1616
&& rm -rf /var/cache/tdnf

flatbush.h

Lines changed: 52 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -380,10 +380,9 @@ static const auto kMaskInterleave3 = _mm_set1_epi32(0x33333333);
380380
static const auto kMaskInterleave4 = _mm_set1_epi32(0x55555555);
381381

382382
#if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX512
383-
static const auto kShuffleMinX512 = _mm512_setr_epi64(0, 4, 8, 12, 0, 0, 0, 0);
384-
static const auto kShuffleMinY512 = _mm512_setr_epi64(1, 5, 9, 13, 0, 0, 0, 0);
385-
static const auto kShuffleMaxX512 = _mm512_setr_epi64(2, 6, 10, 14, 0, 0, 0, 0);
386-
static const auto kShuffleMaxY512 = _mm512_setr_epi64(3, 7, 11, 15, 0, 0, 0, 0);
383+
static const auto kPermuteMinXY512 = _mm512_setr_epi64(0, 1, 4, 5, 8, 9, 12, 13);
384+
static const auto kPermuteMaxXY512 = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15);
385+
static const auto kPermuteXLoYHi = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
387386
#endif
388387

389388
inline __m128i Interleave(__m128i v) {
@@ -1009,10 +1008,17 @@ std::vector<uint32_t> computeHilbertValues(size_t iNumItems,
10091008
auto wIdx = 0UL;
10101009

10111010
#if defined(FLATBUSH_USE_SIMD)
1011+
#if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX
1012+
const auto wHilbertWidth128 = _mm_broadcast_ss(&wHilbertWidth);
1013+
const auto wHilbertHeight128 = _mm_broadcast_ss(&wHilbertHeight);
1014+
const auto wDoubleMinX128 = _mm_broadcast_ss(&wDoubleMinX);
1015+
const auto wDoubleMinY128 = _mm_broadcast_ss(&wDoubleMinY);
1016+
#else
10121017
const auto wHilbertWidth128 = _mm_set1_ps(wHilbertWidth);
10131018
const auto wHilbertHeight128 = _mm_set1_ps(wHilbertHeight);
10141019
const auto wDoubleMinX128 = _mm_set1_ps(wDoubleMinX);
10151020
const auto wDoubleMinY128 = _mm_set1_ps(wDoubleMinY);
1021+
#endif
10161022

10171023
for (; wIdx + 3 < iNumItems; wIdx += 4) {
10181024
__m128 wMinX;
@@ -1041,9 +1047,9 @@ std::vector<uint32_t> computeHilbertValues(size_t iNumItems,
10411047
}
10421048

10431049
template <>
1044-
inline std::vector<uint32_t> computeHilbertValues<double>(size_t iNumItems,
1045-
const Box<double>& iBounds,
1046-
span<Box<double>> iBoxes) {
1050+
std::vector<uint32_t> computeHilbertValues<double>(size_t iNumItems,
1051+
const Box<double>& iBounds,
1052+
span<Box<double>> iBoxes) {
10471053
static constexpr auto kMaxHilbertRatio = 0.5 * std::numeric_limits<uint16_t>::max();
10481054
const auto wHilbertWidth = kMaxHilbertRatio / (iBounds.mMaxX - iBounds.mMinX);
10491055
const auto wHilbertHeight = kMaxHilbertRatio / (iBounds.mMaxY - iBounds.mMinY);
@@ -1053,25 +1059,36 @@ inline std::vector<uint32_t> computeHilbertValues<double>(size_t iNumItems,
10531059
auto wIdx = 0UL;
10541060

10551061
#if defined(FLATBUSH_USE_SIMD)
1056-
#if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX
1057-
const auto wHilbertWidth256 = _mm256_set1_pd(wHilbertWidth);
1058-
const auto wHilbertHeight256 = _mm256_set1_pd(wHilbertHeight);
1059-
const auto wDoubleMinX256 = _mm256_set1_pd(wDoubleMinX);
1060-
const auto wDoubleMinY256 = _mm256_set1_pd(wDoubleMinY);
1062+
#if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX512
1063+
const auto wHilbertWidth512 = _mm512_set1_pd(wHilbertWidth);
1064+
const auto wHilbertHeight512 = _mm512_set1_pd(wHilbertHeight);
1065+
const auto wDoubleMinX512 = _mm512_set1_pd(wDoubleMinX);
1066+
const auto wDoubleMinY512 = _mm512_set1_pd(wDoubleMinY);
1067+
const auto wWidthHeight512 = _mm512_mask_blend_pd(0xAA, wHilbertWidth512, wHilbertHeight512);
1068+
const auto wDoubleMinXY512 = _mm512_mask_blend_pd(0xAA, wDoubleMinX512, wDoubleMinY512);
10611069

10621070
for (; wIdx + 3 < iNumItems; wIdx += 4) {
1063-
#if FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX512
10641071
const auto wBoxes01 = _mm512_loadu_pd(&iBoxes[wIdx].mMinX);
10651072
const auto wBoxes23 = _mm512_loadu_pd(&iBoxes[wIdx + 2].mMinX);
1066-
const auto wMinX =
1067-
_mm512_castpd512_pd256(_mm512_permutex2var_pd(wBoxes01, kShuffleMinX512, wBoxes23));
1068-
const auto wMinY =
1069-
_mm512_castpd512_pd256(_mm512_permutex2var_pd(wBoxes01, kShuffleMinY512, wBoxes23));
1070-
const auto wMaxX =
1071-
_mm512_castpd512_pd256(_mm512_permutex2var_pd(wBoxes01, kShuffleMaxX512, wBoxes23));
1072-
const auto wMaxY =
1073-
_mm512_castpd512_pd256(_mm512_permutex2var_pd(wBoxes01, kShuffleMaxY512, wBoxes23));
1074-
#else
1073+
const auto wMin = _mm512_permutex2var_pd(wBoxes01, kPermuteMinXY512, wBoxes23);
1074+
const auto wMax = _mm512_permutex2var_pd(wBoxes01, kPermuteMaxXY512, wBoxes23);
1075+
const auto wResult = _mm256_permutevar8x32_epi32(
1076+
_mm512_cvtpd_epi32(_mm512_mul_pd(
1077+
wWidthHeight512, _mm512_sub_pd(_mm512_add_pd(wMin, wMax), wDoubleMinXY512))),
1078+
kPermuteXLoYHi);
1079+
const auto wResultX = _mm256_castsi256_si128(wResult);
1080+
const auto wResultY = _mm256_extracti32x4_epi32(wResult, 1);
1081+
1082+
_mm_storeu_si128(bit_cast<__m128i*>(&wHilbertValues[wIdx]),
1083+
HilbertXYToIndex(wResultX, wResultY));
1084+
}
1085+
#elif FLATBUSH_USE_SIMD >= FLATBUSH_USE_AVX
1086+
const auto wHilbertWidth256 = _mm256_broadcast_sd(&wHilbertWidth);
1087+
const auto wHilbertHeight256 = _mm256_broadcast_sd(&wHilbertHeight);
1088+
const auto wDoubleMinX256 = _mm256_broadcast_sd(&wDoubleMinX);
1089+
const auto wDoubleMinY256 = _mm256_broadcast_sd(&wDoubleMinY);
1090+
1091+
for (; wIdx + 3 < iNumItems; wIdx += 4) {
10751092
const auto wBox0 = _mm256_loadu_pd(&iBoxes[wIdx].mMinX);
10761093
const auto wBox1 = _mm256_loadu_pd(&iBoxes[wIdx + 1].mMinX);
10771094
const auto wBox2 = _mm256_loadu_pd(&iBoxes[wIdx + 2].mMinX);
@@ -1084,7 +1101,6 @@ inline std::vector<uint32_t> computeHilbertValues<double>(size_t iNumItems,
10841101
const auto wMinY = _mm256_permute2f128_pd(wBoxes01Hi, wBoxes23Hi, 0x20);
10851102
const auto wMaxX = _mm256_permute2f128_pd(wBoxes01Lo, wBoxes23Lo, 0x31);
10861103
const auto wMaxY = _mm256_permute2f128_pd(wBoxes01Hi, wBoxes23Hi, 0x31);
1087-
#endif
10881104
const auto wSumX = _mm256_add_pd(wMinX, wMaxX);
10891105
const auto wSumY = _mm256_add_pd(wMinY, wMaxY);
10901106
const auto wResultX = _mm256_mul_pd(wHilbertWidth256, _mm256_sub_pd(wSumX, wDoubleMinX256));
@@ -1462,7 +1478,7 @@ void Flatbush<ArrayType>::create(std::vector<Box<ArrayType>>&& iItems) noexcept
14621478
auto wNodeBox = mBoxes[wPosition];
14631479

14641480
// calculate bbox for the new node
1465-
for (size_t wCount = 0; wCount < wNodeSize && wPosition < wEnd; ++wCount, ++wPosition) {
1481+
for (size_t wCount = 0UL; wCount < wNodeSize && wPosition < wEnd; ++wCount, ++wPosition) {
14661482
detail::updateBounds(wNodeBox, mBoxes[wPosition]);
14671483
}
14681484

@@ -1611,6 +1627,12 @@ std::vector<size_t> Flatbush<ArrayType>::search(const Box<ArrayType>& iBounds,
16111627

16121628
wNodeIndex = wQueue.back() >> 2U; // for binary compatibility with JS
16131629
wQueue.pop_back();
1630+
1631+
#ifdef __GNUC__
1632+
__builtin_prefetch(&mBoxes[wNodeIndex], 0, 3);
1633+
#elif defined(_MSC_VER)
1634+
_mm_prefetch(detail::bit_cast<const char*>(&mBoxes[wNodeIndex]), _MM_HINT_T0);
1635+
#endif
16141636
}
16151637

16161638
return wResults;
@@ -1666,6 +1688,12 @@ std::vector<size_t> Flatbush<ArrayType>::neighbors(const Point<ArrayType>& iPoin
16661688

16671689
wNodeIndex = wQueue.top().mId >> 3U; // 1 to undo indexing + 2 for binary compatibility with JS
16681690
wQueue.pop();
1691+
1692+
#ifdef __GNUC__
1693+
__builtin_prefetch(&mBoxes[wNodeIndex], 0, 3);
1694+
#elif defined(_MSC_VER)
1695+
_mm_prefetch(detail::bit_cast<const char*>(&mBoxes[wNodeIndex]), _MM_HINT_T0);
1696+
#endif
16691697
}
16701698

16711699
return wResults;

0 commit comments

Comments
 (0)