Skip to content

Commit 6bd327d

Browse files
committed
add newton-raphson refinement to volk_32f_invsqrt_32f
Signed-off-by: Magnus Lundmark <[email protected]>
1 parent cda245e commit 6bd327d

File tree

6 files changed

+206
-67
lines changed

6 files changed

+206
-67
lines changed

include/volk/volk_avx512_intrinsics.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- c++ -*- */
22
/*
3-
* Copyright 2024 Magnus Lundmark <[email protected]>
3+
* Copyright 2024-2026 Magnus Lundmark <[email protected]>
44
*
55
* This file is part of VOLK
66
*
@@ -16,6 +16,21 @@
1616
#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
1717
#include <immintrin.h>
1818

19+
////////////////////////////////////////////////////////////////////////
// Newton-Raphson refined reciprocal square root: 1/sqrt(a)
// One iteration roughly doubles the precision of the ~14-bit
// _mm512_rsqrt14_ps estimate x0:
//   x1 = x0 * (1.5 - 0.5 * a * x0^2)
// Lanes where a * x0^2 is not close to 1 keep the raw estimate x0.
// This covers a = 0 and a = +Inf (where the product is Inf * 0 = NaN
// and the refinement would turn a correct +Inf / 0 into NaN) as well
// as NaN and negative inputs.
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_rsqrt_nr_ps(const __m512 a)
{
    const __m512 HALF = _mm512_set1_ps(0.5f);
    const __m512 THREE_HALFS = _mm512_set1_ps(1.5f);
    const __m512 TWO = _mm512_set1_ps(2.0f);
    const __m512 x0 = _mm512_rsqrt14_ps(a);
    // Multiply as (a * x0) * x0: a * x0 is about sqrt(a) and stays in range,
    // whereas x0 * x0 alone underflows when a is very large.
    const __m512 a_x0_sq = _mm512_mul_ps(_mm512_mul_ps(a, x0), x0);
    // Ordered compare: false for NaN and for +Inf, true for the normal case
    // where the product is approximately 1.
    const __mmask16 refine = _mm512_cmp_ps_mask(a_x0_sq, TWO, _CMP_LT_OQ);
    // correction = 1.5 - 0.5 * a * x0^2
    const __m512 correction = _mm512_fnmadd_ps(HALF, a_x0_sq, THREE_HALFS);
    // Masked-off lanes pass x0 through unchanged.
    return _mm512_mask_mul_ps(x0, refine, x0, correction);
}
33+
1934
////////////////////////////////////////////////////////////////////////
2035
// Place real parts of two complex vectors in output
2136
// Requires AVX512F

include/volk/volk_avx_intrinsics.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* -*- c++ -*- */
22
/*
33
* Copyright 2015 Free Software Foundation, Inc.
4-
* Copyright 2023 Magnus Lundmark <[email protected]>
4+
* Copyright 2023-2026 Magnus Lundmark <[email protected]>
55
*
66
* This file is part of VOLK
77
*
@@ -17,6 +17,22 @@
1717
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
1818
#include <immintrin.h>
1919

20+
/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * One iteration doubles precision from ~12-bit to ~24-bit
 *   x1 = x0 * (1.5 - 0.5 * a * x0^2)
 * Lanes where a * x0^2 is not close to 1 keep the raw estimate x0.
 * This covers a = 0 and a = +Inf (where the product is Inf * 0 = NaN
 * and the refinement would turn a correct +Inf / 0 into NaN) as well
 * as NaN and negative inputs.
 */
static inline __m256 _mm256_rsqrt_nr_ps(const __m256 a)
{
    const __m256 HALF = _mm256_set1_ps(0.5f);
    const __m256 THREE_HALFS = _mm256_set1_ps(1.5f);
    const __m256 TWO = _mm256_set1_ps(2.0f);
    const __m256 x0 = _mm256_rsqrt_ps(a);
    /* Multiply as (a * x0) * x0: a * x0 is about sqrt(a) and stays in range,
     * whereas x0 * x0 alone underflows when a is very large. */
    const __m256 a_x0_sq = _mm256_mul_ps(_mm256_mul_ps(a, x0), x0);
    /* Ordered compare: all-ones where the product is sane (about 1),
     * all-zeros where it is NaN or +Inf. */
    const __m256 refine = _mm256_cmp_ps(a_x0_sq, TWO, _CMP_LT_OQ);
    const __m256 x1 =
        _mm256_mul_ps(x0, _mm256_sub_ps(THREE_HALFS, _mm256_mul_ps(HALF, a_x0_sq)));
    /* blendv picks x1 where the mask sign bit is set, x0 elsewhere. */
    return _mm256_blendv_ps(x0, x1, refine);
}
35+
2036
/*
2137
* Approximate arctan(x) via polynomial expansion
2238
* on the interval [-1, 1]

include/volk/volk_neon_intrinsics.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* -*- c++ -*- */
22
/*
33
* Copyright 2015 Free Software Foundation, Inc.
4-
* Copyright 2025 Magnus Lundmark <[email protected]>
4+
* Copyright 2025, 2026 Magnus Lundmark <[email protected]>
55
*
66
* This file is part of VOLK
77
*

include/volk/volk_sse_intrinsics.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* -*- c++ -*- */
22
/*
33
* Copyright 2015 Free Software Foundation, Inc.
4-
* Copyright 2023 Magnus Lundmark <[email protected]>
4+
* Copyright 2023-2026 Magnus Lundmark <[email protected]>
55
*
66
* This file is part of VOLK
77
*
@@ -17,6 +17,20 @@
1717
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
1818
#include <xmmintrin.h>
1919

20+
/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * One iteration doubles precision from ~12-bit to ~24-bit
 *   x1 = x0 * (1.5 - 0.5 * a * x0^2)
 * Lanes where a * x0^2 is not close to 1 keep the raw estimate x0.
 * This covers a = 0 and a = +Inf (where the product is Inf * 0 = NaN
 * and the refinement would turn a correct +Inf / 0 into NaN) as well
 * as NaN and negative inputs.
 */
static inline __m128 _mm_rsqrt_nr_ps(const __m128 a)
{
    const __m128 HALF = _mm_set1_ps(0.5f);
    const __m128 THREE_HALFS = _mm_set1_ps(1.5f);
    const __m128 TWO = _mm_set1_ps(2.0f);
    const __m128 x0 = _mm_rsqrt_ps(a);
    /* Multiply as (a * x0) * x0: a * x0 is about sqrt(a) and stays in range,
     * whereas x0 * x0 alone underflows when a is very large. */
    const __m128 a_x0_sq = _mm_mul_ps(_mm_mul_ps(a, x0), x0);
    /* Ordered less-than: all-ones where the product is sane (about 1),
     * all-zeros where it is NaN or +Inf. */
    const __m128 refine = _mm_cmplt_ps(a_x0_sq, TWO);
    const __m128 x1 =
        _mm_mul_ps(x0, _mm_sub_ps(THREE_HALFS, _mm_mul_ps(HALF, a_x0_sq)));
    /* SSE1-only select: x1 where refine is set, x0 elsewhere. */
    return _mm_or_ps(_mm_and_ps(refine, x1), _mm_andnot_ps(refine, x0));
}
33+
2034
/*
2135
* Approximate arctan(x) via polynomial expansion
2236
* on the interval [-1, 1]

0 commit comments

Comments
 (0)