diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 7c75818a..9b84ef7a 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -458,13 +458,12 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b) */ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a) { - __m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1)); - m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2))); - m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1))); - m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0)); + // Build min within groups of 2, then 4, then 8 + __m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))); + m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2))); + m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01)); - __m256i r = astcenc_mm256_set_m128i(m, m); - vint8 vmin(r); + vint8 vmin(m); return vmin; } @@ -481,13 +480,12 @@ ASTCENC_SIMD_INLINE int hmin_s(vint8 a) */ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a) { - __m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1)); - m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2))); - m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1))); - m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0)); + // Build max within groups of 2, then 4, then 8 + __m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))); + m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2))); + m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01)); - __m256i r = astcenc_mm256_set_m128i(m, m); - vint8 vmax(r); + vint8 vmax(m); return vmax; } diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 5c726b6a..0c42c73d 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -606,9 +606,9 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b) */ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a) { - a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2)))); - a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1)))); - return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0))); + a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)))); + a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2)))); + return a; } /* @@ -616,9 +616,9 @@ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a) */ ASTCENC_SIMD_INLINE vint4 hmax(vint4 a) { - a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2)))); - a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1)))); - return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0))); + a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)))); + a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2)))); + return a; } /**