
Commit 575ee86

added avx2 kernels, added inf/0.f/neg masks for all kernels
Signed-off-by: Magnus Lundmark <[email protected]>
1 parent e220082 commit 575ee86

File tree

7 files changed: +264 −119 lines


include/volk/volk_avx2_intrinsics.h

Lines changed: 27 additions & 0 deletions
@@ -18,6 +18,33 @@
 #include "volk/volk_avx_intrinsics.h"
 #include <immintrin.h>
 
+/*
+ * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
+ * AVX2 version with native 256-bit integer operations for edge case handling
+ * Handles edge cases: +0 → +Inf, +Inf → 0
+ */
+static inline __m256 _mm256_rsqrt_nr_avx2(const __m256 a)
+{
+    const __m256 HALF = _mm256_set1_ps(0.5f);
+    const __m256 THREE_HALFS = _mm256_set1_ps(1.5f);
+
+    const __m256 x0 = _mm256_rsqrt_ps(a); // +Inf for +0, 0 for +Inf
+
+    // Newton-Raphson: x1 = x0 * (1.5 - 0.5 * a * x0^2)
+    __m256 x1 = _mm256_mul_ps(
+        x0,
+        _mm256_sub_ps(THREE_HALFS,
+                      _mm256_mul_ps(HALF, _mm256_mul_ps(_mm256_mul_ps(x0, x0), a))));
+
+    // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
+    // AVX2: native 256-bit integer compare
+    __m256i a_si = _mm256_castps_si256(a);
+    __m256i zero_mask = _mm256_cmpeq_epi32(a_si, _mm256_setzero_si256());
+    __m256i inf_mask = _mm256_cmpeq_epi32(a_si, _mm256_set1_epi32(0x7F800000));
+    __m256 special_mask = _mm256_castsi256_ps(_mm256_or_si256(zero_mask, inf_mask));
+    return _mm256_blendv_ps(x1, x0, special_mask);
+}
+
 static inline __m256 _mm256_real(const __m256 z1, const __m256 z2)
 {
     const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
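
Not part of the commit, but the new edge-case behavior is easy to exercise with a throwaway driver like the sketch below. The test values and file layout are illustrative; it assumes VOLK's include directory is on the path and a build with -mavx2.

#include "volk/volk_avx2_intrinsics.h"
#include <math.h>
#include <stdio.h>

int main(void)
{
    // Lane 0 holds +0 (expect +Inf), lane 1 holds +Inf (expect 0);
    // the remaining lanes should come back as refined 1/sqrt values.
    float in[8] = { 0.0f, INFINITY, 1.0f, 4.0f, 0.25f, 2.0f, 100.0f, 1e-6f };
    float out[8];
    _mm256_storeu_ps(out, _mm256_rsqrt_nr_avx2(_mm256_loadu_ps(in)));
    for (int i = 0; i < 8; i++)
        printf("rsqrt(%g) ~= %g\n", (double)in[i], (double)out[i]);
    return 0;
}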

include/volk/volk_avx512_intrinsics.h

Lines changed: 13 additions & 2 deletions
@@ -20,15 +20,26 @@
 // Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 // One iteration doubles precision from ~12-bit to ~24-bit
 // x1 = x0 * (1.5 - 0.5 * a * x0^2)
+// Handles edge cases: +0 → +Inf, +Inf → 0
 // Requires AVX512F
 ////////////////////////////////////////////////////////////////////////
 static inline __m512 _mm512_rsqrt_nr_ps(const __m512 a)
 {
     const __m512 HALF = _mm512_set1_ps(0.5f);
     const __m512 THREE_HALFS = _mm512_set1_ps(1.5f);
-    const __m512 x0 = _mm512_rsqrt14_ps(a);
-    return _mm512_mul_ps(
+
+    const __m512 x0 = _mm512_rsqrt14_ps(a); // +Inf for +0, 0 for +Inf
+
+    // Newton-Raphson: x1 = x0 * (1.5 - 0.5 * a * x0^2)
+    __m512 x1 = _mm512_mul_ps(
         x0, _mm512_fnmadd_ps(HALF, _mm512_mul_ps(_mm512_mul_ps(x0, x0), a), THREE_HALFS));
+
+    // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
+    // Blend: use x0 where a == +0 or a == +Inf, else use x1
+    __m512i a_si = _mm512_castps_si512(a);
+    __mmask16 zero_mask = _mm512_cmpeq_epi32_mask(a_si, _mm512_setzero_si512());
+    __mmask16 inf_mask = _mm512_cmpeq_epi32_mask(a_si, _mm512_set1_epi32(0x7F800000));
+    return _mm512_mask_blend_ps(zero_mask | inf_mask, x1, x0);
 }
 
 ////////////////////////////////////////////////////////////////////////
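
For reference: _mm512_fnmadd_ps(a, b, c) computes c - a*b in one fused operation, so the AVX-512 path expresses the same refinement step as the other kernels with one fewer rounding. A scalar sketch of the identity being used (illustrative only, not part of the commit):

/* One Newton-Raphson step for 1/sqrt(a), as folded into fnmadd above:
 * x1 = x0 * (THREE_HALFS - HALF * a * x0^2) */
static inline float nr_step_scalar(float a, float x0)
{
    return x0 * (1.5f - 0.5f * a * x0 * x0);
}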

include/volk/volk_avx_intrinsics.h

Lines changed: 23 additions & 2 deletions
@@ -21,16 +21,37 @@
  * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
  * One iteration doubles precision from ~12-bit to ~24-bit
  * x1 = x0 * (1.5 - 0.5 * a * x0^2)
+ * Handles edge cases: +0 → +Inf, +Inf → 0
  */
 static inline __m256 _mm256_rsqrt_nr_ps(const __m256 a)
 {
     const __m256 HALF = _mm256_set1_ps(0.5f);
     const __m256 THREE_HALFS = _mm256_set1_ps(1.5f);
-    const __m256 x0 = _mm256_rsqrt_ps(a);
-    return _mm256_mul_ps(
+
+    const __m256 x0 = _mm256_rsqrt_ps(a); // +Inf for +0, 0 for +Inf
+
+    // Newton-Raphson: x1 = x0 * (1.5 - 0.5 * a * x0^2)
+    __m256 x1 = _mm256_mul_ps(
         x0,
         _mm256_sub_ps(THREE_HALFS,
                       _mm256_mul_ps(HALF, _mm256_mul_ps(_mm256_mul_ps(x0, x0), a))));
+
+    // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
+    // Blend: use x0 where a == +0 or a == +Inf, else use x1
+    // AVX-only: use SSE2 integer compare, then reconstruct AVX mask
+    __m128i a_lo = _mm256_castsi256_si128(_mm256_castps_si256(a));
+    __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
+    __m128i zero_si = _mm_setzero_si128();
+    __m128i inf_si = _mm_set1_epi32(0x7F800000);
+    __m128i zero_mask_lo = _mm_cmpeq_epi32(a_lo, zero_si);
+    __m128i zero_mask_hi = _mm_cmpeq_epi32(a_hi, zero_si);
+    __m128i inf_mask_lo = _mm_cmpeq_epi32(a_lo, inf_si);
+    __m128i inf_mask_hi = _mm_cmpeq_epi32(a_hi, inf_si);
+    __m128 mask_lo = _mm_castsi128_ps(_mm_or_si128(zero_mask_lo, inf_mask_lo));
+    __m128 mask_hi = _mm_castsi128_ps(_mm_or_si128(zero_mask_hi, inf_mask_hi));
+    __m256 special_mask =
+        _mm256_insertf128_ps(_mm256_castps128_ps256(mask_lo), mask_hi, 1);
+    return _mm256_blendv_ps(x1, x0, special_mask);
 }
 
 /*
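
The lane-split above is needed because AVX1 has no 256-bit integer compare. A hypothetical alternative would be a floating-point compare, which stays in one 256-bit register but is not bit-exact with the commit's integer compare, since FP equality also matches -0.0f. Sketch, assuming <immintrin.h> and <math.h>:

static inline __m256 special_mask_cmp_ps(const __m256 a)
{
    // Matches a == +0.0f, a == -0.0f, or a == +Inf (FP equality, not bitwise)
    __m256 zero_mask = _mm256_cmp_ps(a, _mm256_setzero_ps(), _CMP_EQ_OQ);
    __m256 inf_mask = _mm256_cmp_ps(a, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
    return _mm256_or_ps(zero_mask, inf_mask);
}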

include/volk/volk_neon_intrinsics.h

Lines changed: 15 additions & 8 deletions
@@ -80,16 +80,23 @@ static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
     return result;
 }
 
-/* Inverse square root for float32x4_t */
+/* Inverse square root for float32x4_t
+ * Handles edge cases: +0 → +Inf, +Inf → 0 */
 static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
 {
-    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
-    sqrt_reciprocal = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-
-    return sqrt_reciprocal;
+    float32x4_t x0 = vrsqrteq_f32(x); // +Inf for +0, 0 for +Inf
+
+    // Newton-Raphson refinement using vrsqrtsq_f32
+    float32x4_t x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x0), x0), x0);
+    x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x1), x1), x1);
+
+    // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
+    // Blend: use x0 where x == +0 or x == +Inf, else use x1
+    uint32x4_t x_bits = vreinterpretq_u32_f32(x);
+    uint32x4_t zero_mask = vceqq_u32(x_bits, vdupq_n_u32(0x00000000));
+    uint32x4_t inf_mask = vceqq_u32(x_bits, vdupq_n_u32(0x7F800000));
+    uint32x4_t special_mask = vorrq_u32(zero_mask, inf_mask);
+    return vbslq_f32(special_mask, x0, x1);
 }
 
 /* Square root for ARMv7 NEON (no vsqrtq_f32)
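
A note on the NEON helper (not in the diff): vrsqrtsq_f32(p, q) returns (3 - p*q) / 2, so each refinement line above is one full Newton-Raphson step; the vrsqrteq_f32 estimate is coarser than the x86 rsqrt estimate, which is presumably why this kernel runs two steps. Scalar model of one step (illustrative):

/* x1 = x0 * vrsqrts(x * x0, x0) = x0 * (3 - x * x0^2) / 2,
 * algebraically the same step as x0 * (1.5 - 0.5 * x * x0^2) */
static inline float neon_nr_step(float x, float x0)
{
    return x0 * ((3.0f - (x * x0) * x0) * 0.5f);
}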

include/volk/volk_sse_intrinsics.h

Lines changed: 16 additions & 2 deletions
@@ -15,20 +15,34 @@
 
 #ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
+#include <emmintrin.h>
 #include <xmmintrin.h>
 
 /*
  * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
  * One iteration doubles precision from ~12-bit to ~24-bit
  * x1 = x0 * (1.5 - 0.5 * a * x0^2)
+ * Handles edge cases: +0 → +Inf, +Inf → 0
  */
 static inline __m128 _mm_rsqrt_nr_ps(const __m128 a)
 {
     const __m128 HALF = _mm_set1_ps(0.5f);
     const __m128 THREE_HALFS = _mm_set1_ps(1.5f);
-    const __m128 x0 = _mm_rsqrt_ps(a);
-    return _mm_mul_ps(
+
+    const __m128 x0 = _mm_rsqrt_ps(a); // +Inf for +0, 0 for +Inf
+
+    // Newton-Raphson: x1 = x0 * (1.5 - 0.5 * a * x0^2)
+    __m128 x1 = _mm_mul_ps(
         x0, _mm_sub_ps(THREE_HALFS, _mm_mul_ps(HALF, _mm_mul_ps(_mm_mul_ps(x0, x0), a))));
+
+    // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
+    // Blend: use x0 where a == +0 or a == +Inf, else use x1
+    __m128i a_si = _mm_castps_si128(a);
+    __m128i zero_mask = _mm_cmpeq_epi32(a_si, _mm_setzero_si128());
+    __m128i inf_mask = _mm_cmpeq_epi32(a_si, _mm_set1_epi32(0x7F800000));
+    __m128 special_mask = _mm_castsi128_ps(_mm_or_si128(zero_mask, inf_mask));
+    // SSE2-compatible blend: (x0 & mask) | (x1 & ~mask)
+    return _mm_or_ps(_mm_and_ps(special_mask, x0), _mm_andnot_ps(special_mask, x1));
 }
 
 /*
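
The and/andnot/or combination at the end stands in for _mm_blendv_ps, which is SSE4.1 and unavailable at this baseline; the newly added <emmintrin.h> include supplies the SSE2 integer compares. A hypothetical scalar reference for unit-testing any of the five kernels (the name and the memcpy bit-extraction are illustrative; not part of the commit):

#include <math.h>
#include <stdint.h>
#include <string.h>

static inline float rsqrt_nr_ref(float a)
{
    uint32_t bits;
    memcpy(&bits, &a, sizeof bits);   // same view of a as casting ps to epi32
    if (bits == 0x00000000u)
        return INFINITY;              // +0 -> +Inf, bypassing NR
    if (bits == 0x7F800000u)
        return 0.0f;                  // +Inf -> 0, bypassing NR
    float x0 = 1.0f / sqrtf(a);       // stand-in for the hardware estimate
    return x0 * (1.5f - 0.5f * a * x0 * x0); // one NR step
}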
