Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions include/volk/volk_avx2_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,33 @@
#include "volk/volk_avx_intrinsics.h"
#include <immintrin.h>

/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * AVX2 version with native 256-bit integer operations for edge case handling
 * Handles edge cases: +0 → +Inf, +Inf → 0
 */
static inline __m256 _mm256_rsqrt_nr_avx2(const __m256 a)
{
    // ~12-bit hardware estimate; already exact for the special inputs:
    // rsqrt(+0) = +Inf and rsqrt(+Inf) = 0.
    const __m256 est = _mm256_rsqrt_ps(a);

    // One Newton-Raphson step: refined = est * (1.5 - 0.5 * a * est^2)
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 three_halfs = _mm256_set1_ps(1.5f);
    const __m256 est_sq_a = _mm256_mul_ps(_mm256_mul_ps(est, est), a);
    const __m256 refined =
        _mm256_mul_ps(est, _mm256_sub_ps(three_halfs, _mm256_mul_ps(half, est_sq_a)));

    // The NR step maps the +0/+Inf lanes to NaN (Inf * 0), so detect those
    // exact bit patterns with native 256-bit integer compares and keep the
    // raw estimate for them.
    const __m256i bits = _mm256_castps_si256(a);
    const __m256i is_zero = _mm256_cmpeq_epi32(bits, _mm256_setzero_si256());
    const __m256i is_inf = _mm256_cmpeq_epi32(bits, _mm256_set1_epi32(0x7F800000));
    const __m256 keep_est = _mm256_castsi256_ps(_mm256_or_si256(is_zero, is_inf));
    return _mm256_blendv_ps(refined, est, keep_est);
}

static inline __m256 _mm256_real(const __m256 z1, const __m256 z2)
{
const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
Expand Down
28 changes: 27 additions & 1 deletion include/volk/volk_avx512_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* -*- c++ -*- */
/*
* Copyright 2024 Magnus Lundmark <[email protected]>
* Copyright 2024-2026 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand All @@ -16,6 +16,32 @@
#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#include <immintrin.h>

////////////////////////////////////////////////////////////////////////
// Newton-Raphson refined reciprocal square root: 1/sqrt(a)
// One iteration doubles precision from ~12-bit to ~24-bit
// x1 = x0 * (1.5 - 0.5 * a * x0^2)
// Handles edge cases: +0 → +Inf, +Inf → 0
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_rsqrt_nr_ps(const __m512 a)
{
    // ~14-bit hardware estimate; already exact for +0 (-> +Inf) and +Inf (-> 0)
    const __m512 est = _mm512_rsqrt14_ps(a);

    // One NR step via fused negated multiply-add:
    //   refined = est * (1.5 - 0.5 * a * est^2)
    const __m512 half = _mm512_set1_ps(0.5f);
    const __m512 three_halfs = _mm512_set1_ps(1.5f);
    const __m512 est_sq_a = _mm512_mul_ps(_mm512_mul_ps(est, est), a);
    const __m512 refined = _mm512_mul_ps(est, _mm512_fnmadd_ps(half, est_sq_a, three_halfs));

    // The NR step maps the +0/+Inf lanes to NaN (Inf * 0); mask-blend the
    // raw estimate back in for those exact bit patterns.
    const __m512i bits = _mm512_castps_si512(a);
    const __mmask16 k_zero = _mm512_cmpeq_epi32_mask(bits, _mm512_setzero_si512());
    const __mmask16 k_inf = _mm512_cmpeq_epi32_mask(bits, _mm512_set1_epi32(0x7F800000));
    return _mm512_mask_blend_ps((__mmask16)(k_zero | k_inf), refined, est);
}

////////////////////////////////////////////////////////////////////////
// Place real parts of two complex vectors in output
// Requires AVX512F
Expand Down
39 changes: 38 additions & 1 deletion include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
* Copyright 2023-2026 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand All @@ -17,6 +17,43 @@
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * One iteration doubles precision from ~12-bit to ~24-bit
 * x1 = x0 * (1.5 - 0.5 * a * x0^2)
 * Handles edge cases: +0 → +Inf, +Inf → 0
 */
static inline __m256 _mm256_rsqrt_nr_ps(const __m256 a)
{
    // ~12-bit hardware estimate; already exact for +0 (-> +Inf) and +Inf (-> 0)
    const __m256 est = _mm256_rsqrt_ps(a);

    // One NR step: refined = est * (1.5 - 0.5 * a * est^2)
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 three_halfs = _mm256_set1_ps(1.5f);
    const __m256 est_sq_a = _mm256_mul_ps(_mm256_mul_ps(est, est), a);
    const __m256 refined =
        _mm256_mul_ps(est, _mm256_sub_ps(three_halfs, _mm256_mul_ps(half, est_sq_a)));

    // The NR step maps the +0/+Inf lanes to NaN (Inf * 0).  AVX1 lacks a
    // 256-bit integer compare, so run the bit-pattern tests per 128-bit
    // half with SSE2 and stitch the halves back into a 256-bit mask.
    const __m128i lo = _mm256_castsi256_si128(_mm256_castps_si256(a));
    const __m128i hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
    const __m128i inf_bits = _mm_set1_epi32(0x7F800000);
    const __m128i special_lo = _mm_or_si128(_mm_cmpeq_epi32(lo, _mm_setzero_si128()),
                                            _mm_cmpeq_epi32(lo, inf_bits));
    const __m128i special_hi = _mm_or_si128(_mm_cmpeq_epi32(hi, _mm_setzero_si128()),
                                            _mm_cmpeq_epi32(hi, inf_bits));
    const __m256 keep_est =
        _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(special_lo)),
                             _mm_castsi128_ps(special_hi),
                             1);
    return _mm256_blendv_ps(refined, est, keep_est);
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
Expand Down
25 changes: 16 additions & 9 deletions include/volk/volk_neon_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2025 Magnus Lundmark <[email protected]>
* Copyright 2025, 2026 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand Down Expand Up @@ -80,16 +80,23 @@ static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
return result;
}

/* Inverse square root for float32x4_t
 * Two Newton-Raphson refinement steps via vrsqrtsq_f32.
 * Handles edge cases: +0 → +Inf, +Inf → 0 */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    float32x4_t x0 = vrsqrteq_f32(x); // +Inf for +0, 0 for +Inf

    // Newton-Raphson refinement using vrsqrtsq_f32:
    // vrsqrts(a, b) computes (3 - a*b)/2, so x_{n+1} = x_n * vrsqrts(x * x_n, x_n)
    float32x4_t x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x0), x0), x0);
    x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x1), x1), x1);

    // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
    // Blend: use x0 where x == +0 or x == +Inf, else use x1
    uint32x4_t x_bits = vreinterpretq_u32_f32(x);
    uint32x4_t zero_mask = vceqq_u32(x_bits, vdupq_n_u32(0x00000000));
    uint32x4_t inf_mask = vceqq_u32(x_bits, vdupq_n_u32(0x7F800000));
    uint32x4_t special_mask = vorrq_u32(zero_mask, inf_mask);
    return vbslq_f32(special_mask, x0, x1);
}

/* Square root for ARMv7 NEON (no vsqrtq_f32)
Expand Down
30 changes: 29 additions & 1 deletion include/volk/volk_sse_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
* Copyright 2023-2026 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand All @@ -15,8 +15,36 @@

#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
#include <emmintrin.h>
#include <xmmintrin.h>

/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * One iteration doubles precision from ~12-bit to ~24-bit
 * x1 = x0 * (1.5 - 0.5 * a * x0^2)
 * Handles edge cases: +0 → +Inf, +Inf → 0
 */
static inline __m128 _mm_rsqrt_nr_ps(const __m128 a)
{
    // ~12-bit hardware estimate; already exact for +0 (-> +Inf) and +Inf (-> 0)
    const __m128 est = _mm_rsqrt_ps(a);

    // One NR step: refined = est * (1.5 - 0.5 * a * est^2)
    const __m128 half = _mm_set1_ps(0.5f);
    const __m128 three_halfs = _mm_set1_ps(1.5f);
    const __m128 est_sq_a = _mm_mul_ps(_mm_mul_ps(est, est), a);
    const __m128 refined =
        _mm_mul_ps(est, _mm_sub_ps(three_halfs, _mm_mul_ps(half, est_sq_a)));

    // The NR step maps the +0/+Inf lanes to NaN (Inf * 0); keep the raw
    // estimate for those exact bit patterns.
    const __m128i bits = _mm_castps_si128(a);
    const __m128i is_zero = _mm_cmpeq_epi32(bits, _mm_setzero_si128());
    const __m128i is_inf = _mm_cmpeq_epi32(bits, _mm_set1_epi32(0x7F800000));
    const __m128 keep_est = _mm_castsi128_ps(_mm_or_si128(is_zero, is_inf));
    // SSE2-compatible blend: (est & mask) | (refined & ~mask)
    return _mm_or_ps(_mm_and_ps(keep_est, est), _mm_andnot_ps(keep_est, refined));
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
Expand Down
Loading