From b79248ae0ee9aa3ee6d8d0546642c2cdc5cf1a26 Mon Sep 17 00:00:00 2001
From: Fabian Giesen
Date: Fri, 1 Nov 2024 14:12:44 -0700
Subject: [PATCH] Better hmin/hmax algorithms for SSE/AVX2

Use a formulation that automatically produces the same result in all
lanes, avoiding a separate broadcast step.

The same approach would work with floats in principle, but it's not
guaranteed to give the same result in all lanes when NaNs are involved
(due to the way MINPS/MAXPS are defined), so leave the float versions
alone for now.

About 1% encode time reduction encoding an 8192x8192 test texture at
6x6 -thorough on a Ryzen 7950X3D.
---
 Source/astcenc_vecmathlib_avx2_8.h | 22 ++++++++++------------
 Source/astcenc_vecmathlib_sse_4.h  | 12 ++++++------
 2 files changed, 16 insertions(+), 18 deletions(-)
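For reference, a minimal standalone sketch (not part of the patch) of
the new AVX2 reduction below. After each min step, every lane holds
the min of a group twice as wide as before, so after three steps every
lane holds the overall min and no broadcast is needed. Assumes an
AVX2-capable target (build with e.g. -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	__m256i a = _mm256_setr_epi32(7, 3, 9, 1, 8, 2, 6, 4);

	// Min within groups of 2: swap neighbors within each 128-bit half
	__m256i m = _mm256_min_epi32(a, _mm256_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)));
	// Min within groups of 4: swap pairs within each 128-bit half
	m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
	// Min within groups of 8: swap the two 128-bit halves
	m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

	int out[8];
	_mm256_storeu_si256((__m256i *)out, m);
	for (int i = 0; i < 8; i++)
	{
		printf("%d ", out[i]); // prints "1" in all eight lanes
	}
	printf("\n");
	return 0;
}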
diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h
index 7c75818a..9b84ef7a 100644
--- a/Source/astcenc_vecmathlib_avx2_8.h
+++ b/Source/astcenc_vecmathlib_avx2_8.h
@@ -458,13 +458,12 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
  */
 ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
 {
-	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+	// Build min within groups of 2, then 4, then 8
+	__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+	m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+	m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));
 
-	__m256i r = astcenc_mm256_set_m128i(m, m);
-	vint8 vmin(r);
+	vint8 vmin(m);
 	return vmin;
 }
 
@@ -481,13 +480,12 @@ ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
  */
 ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
 {
-	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+	// Build max within groups of 2, then 4, then 8
+	__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+	m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+	m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));
 
-	__m256i r = astcenc_mm256_set_m128i(m, m);
-	vint8 vmax(r);
+	vint8 vmax(m);
 	return vmax;
 }
 
diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h
index 5c726b6a..0c42c73d 100644
--- a/Source/astcenc_vecmathlib_sse_4.h
+++ b/Source/astcenc_vecmathlib_sse_4.h
@@ -606,9 +606,9 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
  */
 ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
 {
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /*
@@ -616,9 +616,9 @@ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
  */
 ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 {
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /**
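As a footnote to the NaN caveat in the message above, a minimal sketch
(again not part of the patch) of why the float versions are left
alone: MINPS computes (a < b) ? a : b per lane, so it returns its
second operand whenever either input is NaN, and min(x, shuffle(x))
can then disagree between lanes:

#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
	__m128 x = _mm_setr_ps(1.0f, NAN, 2.0f, 3.0f);

	// First butterfly step on floats: min with neighbors swapped
	__m128 m = _mm_min_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));

	float out[4];
	_mm_storeu_ps(out, m);
	// Lane 0 is min(1, NaN) = NaN, but lane 1 is min(NaN, 1) = 1:
	// the lanes already differ after a single step
	printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
	return 0;
}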