Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions include/volk/volk_avx2_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,33 @@
#include "volk/volk_avx_intrinsics.h"
#include <immintrin.h>

/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * AVX2 version with native 256-bit integer operations for edge case handling
 * Handles edge cases: +0 → +Inf, +Inf → 0
 */
static inline __m256 _mm256_rsqrt_nr_avx2(const __m256 a)
{
    // ~12-bit hardware estimate; already exact for the special inputs:
    // rsqrt(+0) = +Inf and rsqrt(+Inf) = 0.
    const __m256 est = _mm256_rsqrt_ps(a);

    // One Newton-Raphson step: refined = est * (1.5 - 0.5 * a * est^2)
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 three_halfs = _mm256_set1_ps(1.5f);
    const __m256 est_sq_a = _mm256_mul_ps(_mm256_mul_ps(est, est), a);
    const __m256 refined =
        _mm256_mul_ps(est, _mm256_sub_ps(three_halfs, _mm256_mul_ps(half, est_sq_a)));

    // The NR step maps the +0/+Inf lanes to NaN (Inf * 0), so detect those
    // exact bit patterns with native 256-bit integer compares and keep the
    // raw estimate for them.
    const __m256i bits = _mm256_castps_si256(a);
    const __m256i is_zero = _mm256_cmpeq_epi32(bits, _mm256_setzero_si256());
    const __m256i is_inf = _mm256_cmpeq_epi32(bits, _mm256_set1_epi32(0x7F800000));
    const __m256 keep_est = _mm256_castsi256_ps(_mm256_or_si256(is_zero, is_inf));
    return _mm256_blendv_ps(refined, est, keep_est);
}

static inline __m256 _mm256_real(const __m256 z1, const __m256 z2)
{
const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
Expand Down
28 changes: 27 additions & 1 deletion include/volk/volk_avx512_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* -*- c++ -*- */
/*
* Copyright 2024 Magnus Lundmark <[email protected]>
* Copyright 2024-2026 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand All @@ -16,6 +16,32 @@
#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#include <immintrin.h>

////////////////////////////////////////////////////////////////////////
// Newton-Raphson refined reciprocal square root: 1/sqrt(a)
// One iteration doubles precision from ~12-bit to ~24-bit
// x1 = x0 * (1.5 - 0.5 * a * x0^2)
// Handles edge cases: +0 → +Inf, +Inf → 0
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_rsqrt_nr_ps(const __m512 a)
{
    // ~14-bit hardware estimate; already exact for +0 (-> +Inf) and +Inf (-> 0)
    const __m512 est = _mm512_rsqrt14_ps(a);

    // One NR step via fused negated multiply-add:
    //   refined = est * (1.5 - 0.5 * a * est^2)
    const __m512 half = _mm512_set1_ps(0.5f);
    const __m512 three_halfs = _mm512_set1_ps(1.5f);
    const __m512 est_sq_a = _mm512_mul_ps(_mm512_mul_ps(est, est), a);
    const __m512 refined = _mm512_mul_ps(est, _mm512_fnmadd_ps(half, est_sq_a, three_halfs));

    // The NR step maps the +0/+Inf lanes to NaN (Inf * 0); mask-blend the
    // raw estimate back in for those exact bit patterns.
    const __m512i bits = _mm512_castps_si512(a);
    const __mmask16 k_zero = _mm512_cmpeq_epi32_mask(bits, _mm512_setzero_si512());
    const __mmask16 k_inf = _mm512_cmpeq_epi32_mask(bits, _mm512_set1_epi32(0x7F800000));
    return _mm512_mask_blend_ps((__mmask16)(k_zero | k_inf), refined, est);
}

////////////////////////////////////////////////////////////////////////
// Place real parts of two complex vectors in output
// Requires AVX512F
Expand Down
39 changes: 38 additions & 1 deletion include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
* Copyright 2023-2026 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand All @@ -17,6 +17,43 @@
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * One iteration doubles precision from ~12-bit to ~24-bit
 * x1 = x0 * (1.5 - 0.5 * a * x0^2)
 * Handles edge cases: +0 → +Inf, +Inf → 0
 */
static inline __m256 _mm256_rsqrt_nr_ps(const __m256 a)
{
    // ~12-bit hardware estimate; already exact for +0 (-> +Inf) and +Inf (-> 0)
    const __m256 est = _mm256_rsqrt_ps(a);

    // One NR step: refined = est * (1.5 - 0.5 * a * est^2)
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 three_halfs = _mm256_set1_ps(1.5f);
    const __m256 est_sq_a = _mm256_mul_ps(_mm256_mul_ps(est, est), a);
    const __m256 refined =
        _mm256_mul_ps(est, _mm256_sub_ps(three_halfs, _mm256_mul_ps(half, est_sq_a)));

    // The NR step maps the +0/+Inf lanes to NaN (Inf * 0).  AVX1 lacks a
    // 256-bit integer compare, so run the bit-pattern tests per 128-bit
    // half with SSE2 and stitch the halves back into a 256-bit mask.
    const __m128i lo = _mm256_castsi256_si128(_mm256_castps_si256(a));
    const __m128i hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
    const __m128i inf_bits = _mm_set1_epi32(0x7F800000);
    const __m128i special_lo = _mm_or_si128(_mm_cmpeq_epi32(lo, _mm_setzero_si128()),
                                            _mm_cmpeq_epi32(lo, inf_bits));
    const __m128i special_hi = _mm_or_si128(_mm_cmpeq_epi32(hi, _mm_setzero_si128()),
                                            _mm_cmpeq_epi32(hi, inf_bits));
    const __m256 keep_est =
        _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(special_lo)),
                             _mm_castsi128_ps(special_hi),
                             1);
    return _mm256_blendv_ps(refined, est, keep_est);
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
Expand Down
25 changes: 16 additions & 9 deletions include/volk/volk_neon_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2025 Magnus Lundmark <[email protected]>
* Copyright 2025, 2026 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand Down Expand Up @@ -80,16 +80,23 @@ static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
return result;
}

/* Inverse square root for float32x4_t
 * Two Newton-Raphson refinement steps via vrsqrtsq_f32.
 * Handles edge cases: +0 → +Inf, +Inf → 0 */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    float32x4_t x0 = vrsqrteq_f32(x); // +Inf for +0, 0 for +Inf

    // Newton-Raphson refinement using vrsqrtsq_f32:
    // vrsqrts(a, b) computes (3 - a*b)/2, so x_{n+1} = x_n * vrsqrts(x * x_n, x_n)
    float32x4_t x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x0), x0), x0);
    x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x1), x1), x1);

    // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
    // Blend: use x0 where x == +0 or x == +Inf, else use x1
    uint32x4_t x_bits = vreinterpretq_u32_f32(x);
    uint32x4_t zero_mask = vceqq_u32(x_bits, vdupq_n_u32(0x00000000));
    uint32x4_t inf_mask = vceqq_u32(x_bits, vdupq_n_u32(0x7F800000));
    uint32x4_t special_mask = vorrq_u32(zero_mask, inf_mask);
    return vbslq_f32(special_mask, x0, x1);
}

/* Square root for ARMv7 NEON (no vsqrtq_f32)
Expand Down
30 changes: 29 additions & 1 deletion include/volk/volk_sse_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
* Copyright 2023-2026 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand All @@ -15,8 +15,36 @@

#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
#include <emmintrin.h>
#include <xmmintrin.h>

/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * One iteration doubles precision from ~12-bit to ~24-bit
 * x1 = x0 * (1.5 - 0.5 * a * x0^2)
 * Handles edge cases: +0 → +Inf, +Inf → 0
 */
static inline __m128 _mm_rsqrt_nr_ps(const __m128 a)
{
    // ~12-bit hardware estimate; already exact for +0 (-> +Inf) and +Inf (-> 0)
    const __m128 est = _mm_rsqrt_ps(a);

    // One NR step: refined = est * (1.5 - 0.5 * a * est^2)
    const __m128 half = _mm_set1_ps(0.5f);
    const __m128 three_halfs = _mm_set1_ps(1.5f);
    const __m128 est_sq_a = _mm_mul_ps(_mm_mul_ps(est, est), a);
    const __m128 refined =
        _mm_mul_ps(est, _mm_sub_ps(three_halfs, _mm_mul_ps(half, est_sq_a)));

    // The NR step maps the +0/+Inf lanes to NaN (Inf * 0); keep the raw
    // estimate for those exact bit patterns.
    const __m128i bits = _mm_castps_si128(a);
    const __m128i is_zero = _mm_cmpeq_epi32(bits, _mm_setzero_si128());
    const __m128i is_inf = _mm_cmpeq_epi32(bits, _mm_set1_epi32(0x7F800000));
    const __m128 keep_est = _mm_castsi128_ps(_mm_or_si128(is_zero, is_inf));
    // SSE2-compatible blend: (est & mask) | (refined & ~mask)
    return _mm_or_ps(_mm_and_ps(keep_est, est), _mm_andnot_ps(keep_est, refined));
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
Expand Down
Loading