add newton-raphson refinement to volk_32f_invsqrt_32f

Ka-zam · Ka-zam · commit 6bd327d5827e · 2026-01-16T14:47:04.000+01:00
Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h
@@ -1,6 +1,6 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2024 Magnus Lundmark <magnuslundmark@gmail.com>
+ * Copyright 2024-2026 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -16,6 +16,21 @@
 #define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
 #include <immintrin.h>
 
+////////////////////////////////////////////////////////////////////////
+// Newton-Raphson refined reciprocal square root: 1/sqrt(a)
+// Starts from the ~14-bit _mm512_rsqrt14_ps estimate x0 and applies one
+// iteration, x1 = x0 * (1.5 - 0.5 * a * x0^2), which roughly doubles the
+// number of correct bits (close to full single precision).
+//
+// For a = +/-0 (x0 = +/-Inf) and a = +Inf (x0 = 0) the product a * x0^2 is
+// Inf * 0 = NaN, so the iteration would turn an already exact estimate
+// into NaN. Lanes whose refined value is NaN therefore return x0 instead;
+// lanes where x0 itself is NaN (negative or NaN input) still return NaN.
+// NOTE(review): subnormal inputs are not special-cased here - confirm
+// whether callers need them handled.
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_rsqrt_nr_ps(const __m512 a)
+{
+    const __m512 HALF = _mm512_set1_ps(0.5f);
+    const __m512 THREE_HALVES = _mm512_set1_ps(1.5f);
+    const __m512 x0 = _mm512_rsqrt14_ps(a);
+
+    // correction = 1.5 - 0.5 * (x0 * x0 * a), fused as -(HALF * t) + THREE_HALVES
+    const __m512 correction = _mm512_fnmadd_ps(
+        HALF, _mm512_mul_ps(_mm512_mul_ps(x0, x0), a), THREE_HALVES);
+    const __m512 x1 = _mm512_mul_ps(x0, correction);
+
+    // An unordered self-compare is true only for NaN: pick x0 in those lanes.
+    const __mmask16 refined_is_nan = _mm512_cmp_ps_mask(x1, x1, _CMP_UNORD_Q);
+    return _mm512_mask_blend_ps(refined_is_nan, x1, x0);
+}
+
 ////////////////////////////////////////////////////////////////////////
 // Place real parts of two complex vectors in output
 // Requires AVX512F
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+ * Copyright 2023-2026 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -17,6 +17,22 @@
 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
 #include <immintrin.h>
 
+/*
+ * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
+ * Starts from the ~12-bit _mm256_rsqrt_ps estimate x0 and applies one
+ * iteration, x1 = x0 * (1.5 - 0.5 * a * x0^2), which roughly doubles the
+ * number of correct bits.
+ *
+ * For a = +/-0 (x0 = +/-Inf) and a = +Inf (x0 = 0) the product a * x0^2 is
+ * Inf * 0 = NaN, so the iteration would turn an already exact estimate
+ * into NaN. Lanes whose refined value is NaN therefore return x0 instead;
+ * lanes where x0 itself is NaN (negative or NaN input) still return NaN.
+ * NOTE(review): subnormal inputs are not special-cased here - confirm
+ * whether callers need them handled.
+ */
+static inline __m256 _mm256_rsqrt_nr_ps(const __m256 a)
+{
+    const __m256 HALF = _mm256_set1_ps(0.5f);
+    const __m256 THREE_HALVES = _mm256_set1_ps(1.5f);
+    const __m256 x0 = _mm256_rsqrt_ps(a);
+
+    /* correction = 1.5 - 0.5 * (x0 * x0 * a) */
+    const __m256 half_a_x0_sq =
+        _mm256_mul_ps(HALF, _mm256_mul_ps(_mm256_mul_ps(x0, x0), a));
+    const __m256 x1 = _mm256_mul_ps(x0, _mm256_sub_ps(THREE_HALVES, half_a_x0_sq));
+
+    /* An unordered self-compare is true only for NaN: pick x0 in those lanes. */
+    const __m256 refined_is_nan = _mm256_cmp_ps(x1, x1, _CMP_UNORD_Q);
+    return _mm256_blendv_ps(x1, x0, refined_is_nan);
+}
+
 /*
  * Approximate arctan(x) via polynomial expansion
  * on the interval [-1, 1]
diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
- * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
+ * Copyright 2025, 2026 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+ * Copyright 2023-2026 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -17,6 +17,20 @@
 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
 #include <xmmintrin.h>
 
+/*
+ * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
+ * Starts from the ~12-bit _mm_rsqrt_ps estimate x0 and applies one
+ * iteration, x1 = x0 * (1.5 - 0.5 * a * x0^2), which roughly doubles the
+ * number of correct bits.
+ *
+ * For a = +/-0 (x0 = +/-Inf) and a = +Inf (x0 = 0) the product a * x0^2 is
+ * Inf * 0 = NaN, so the iteration would turn an already exact estimate
+ * into NaN. Lanes whose refined value is NaN therefore return x0 instead;
+ * lanes where x0 itself is NaN (negative or NaN input) still return NaN.
+ * NOTE(review): subnormal inputs are not special-cased here - confirm
+ * whether callers need them handled.
+ */
+static inline __m128 _mm_rsqrt_nr_ps(const __m128 a)
+{
+    const __m128 HALF = _mm_set1_ps(0.5f);
+    const __m128 THREE_HALVES = _mm_set1_ps(1.5f);
+    const __m128 x0 = _mm_rsqrt_ps(a);
+
+    /* correction = 1.5 - 0.5 * (x0 * x0 * a) */
+    const __m128 half_a_x0_sq = _mm_mul_ps(HALF, _mm_mul_ps(_mm_mul_ps(x0, x0), a));
+    const __m128 x1 = _mm_mul_ps(x0, _mm_sub_ps(THREE_HALVES, half_a_x0_sq));
+
+    /*
+     * An unordered self-compare yields an all-ones lane only for NaN.
+     * SSE1 has no blend instruction, so select with and/andnot/or:
+     * x0 where the refinement is NaN, x1 everywhere else.
+     */
+    const __m128 refined_is_nan = _mm_cmpunord_ps(x1, x1);
+    return _mm_or_ps(_mm_and_ps(refined_is_nan, x0),
+                     _mm_andnot_ps(refined_is_nan, x1));
+}
+
 /*
  * Approximate arctan(x) via polynomial expansion
  * on the interval [-1, 1]
diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`/* -*- c++ -*- */`
`2`	`2`	`/*`
`3`	`3`	`* Copyright 2015 Free Software Foundation, Inc.`
`4`		`- * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>`
	`4`	`+ * Copyright 2025, 2026 Magnus Lundmark <magnuslundmark@gmail.com>`
`5`	`5`	`*`
`6`	`6`	`* This file is part of VOLK`
`7`	`7`	`*`