gnuradio
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
Lines changed: 41 additions & 1 deletion
diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h
Lines changed: 42 additions & 0 deletions
diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h
Lines changed: 42 additions & 0 deletions
diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h
Lines changed: 96 additions & 1 deletion
diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h
Lines changed: 70 additions & 12 deletions
@@ -1,6 +1,6 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2023 Magnus Lundmark <[email protected]>
+ * Copyright 2023 - 2025 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -70,4 +70,44 @@ static inline __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x)
     return _mm256_mul_ps(x, p);
 }
 
+/*
+ * sin(x) polynomial kernel (AVX2 + FMA), intended for x in [-pi/4, pi/4].
+ * No range reduction is performed here.
+ *
+ * Minimax polynomial, coefficients via Remez algorithm (Sollya)
+ * Max |error| < 7.3e-9
+ * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
+ */
+static inline __m256 _mm256_sin_poly_avx2_fma(const __m256 x)
+{
+    const __m256 x_sq = _mm256_mul_ps(x, x);
+    const __m256 x_cu = _mm256_mul_ps(x_sq, x);
+
+    /* Innermost Horner step first: s2 + x^2 * s3 */
+    const __m256 inner = _mm256_fmadd_ps(
+        x_sq, _mm256_set1_ps(-0x1.9ab22ap-13f), _mm256_set1_ps(+0x1.110be2p-7f));
+
+    /* s1 + x^2 * inner */
+    const __m256 outer = _mm256_fmadd_ps(x_sq, inner, _mm256_set1_ps(-0x1.555552p-3f));
+
+    /* x + x^3 * outer */
+    return _mm256_fmadd_ps(x_cu, outer, x);
+}
+
+/*
+ * cos(x) polynomial kernel (AVX2 + FMA), intended for x in [-pi/4, pi/4].
+ * No range reduction is performed here.
+ *
+ * Minimax polynomial, coefficients via Remez algorithm (Sollya)
+ * Max |error| < 1.1e-7
+ * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
+ */
+static inline __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
+{
+    const __m256 unity = _mm256_set1_ps(1.0f);
+    const __m256 coeff_x2 = _mm256_set1_ps(-0x1.fffff4p-2f);
+    const __m256 coeff_x4 = _mm256_set1_ps(+0x1.554a46p-5f);
+    const __m256 coeff_x6 = _mm256_set1_ps(-0x1.661be2p-10f);
+
+    const __m256 x_sq = _mm256_mul_ps(x, x);
+
+    /* Horner's scheme in x^2, written as one nested chain of fused multiply-adds */
+    return _mm256_fmadd_ps(
+        x_sq,
+        _mm256_fmadd_ps(x_sq, _mm256_fmadd_ps(x_sq, coeff_x6, coeff_x4), coeff_x2),
+        unity);
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
@@ -345,4 +345,46 @@ static inline void vector_32fc_index_min_variant1(__m256 in0,
     *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
 }
 
+/*
+ * Approximate sin(x) via polynomial expansion
+ * on the interval [-pi/4, pi/4] (no range reduction is done here).
+ *
+ * This variant uses separate multiply and add instructions (no FMA).
+ *
+ * Maximum absolute error ~7.3e-9
+ * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
+ */
+static inline __m256 _mm256_sin_poly_avx2(const __m256 x)
+{
+    const __m256 coeff_x3 = _mm256_set1_ps(-0x1.555552p-3f);
+    const __m256 coeff_x5 = _mm256_set1_ps(+0x1.110be2p-7f);
+    const __m256 coeff_x7 = _mm256_set1_ps(-0x1.9ab22ap-13f);
+
+    const __m256 x_sq = _mm256_mul_ps(x, x);
+    const __m256 x_cu = _mm256_mul_ps(x_sq, x);
+
+    /* Horner steps, each spelled out as a multiply followed by an add */
+    __m256 acc = _mm256_mul_ps(x_sq, coeff_x7);
+    acc = _mm256_add_ps(acc, coeff_x5);
+    acc = _mm256_mul_ps(x_sq, acc);
+    acc = _mm256_add_ps(acc, coeff_x3);
+    acc = _mm256_mul_ps(x_cu, acc);
+    return _mm256_add_ps(acc, x);
+}
+
+/*
+ * Approximate cos(x) via polynomial expansion
+ * on the interval [-pi/4, pi/4] (no range reduction is done here).
+ *
+ * This variant uses separate multiply and add instructions (no FMA).
+ *
+ * Maximum absolute error ~1.1e-7
+ * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
+ */
+static inline __m256 _mm256_cos_poly_avx2(const __m256 x)
+{
+    const __m256 unity = _mm256_set1_ps(1.0f);
+    const __m256 coeff_x2 = _mm256_set1_ps(-0x1.fffff4p-2f);
+    const __m256 coeff_x4 = _mm256_set1_ps(+0x1.554a46p-5f);
+    const __m256 coeff_x6 = _mm256_set1_ps(-0x1.661be2p-10f);
+
+    const __m256 x_sq = _mm256_mul_ps(x, x);
+
+    /* Horner steps, each spelled out as a multiply followed by an add */
+    __m256 acc = _mm256_mul_ps(x_sq, coeff_x6);
+    acc = _mm256_add_ps(acc, coeff_x4);
+    acc = _mm256_mul_ps(x_sq, acc);
+    acc = _mm256_add_ps(acc, coeff_x2);
+    acc = _mm256_mul_ps(x_sq, acc);
+    return _mm256_add_ps(acc, unity);
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
@@ -170,4 +170,46 @@ static inline __m512 _mm512_normalize_ps(const __m512 val)
     return _mm512_div_ps(val, mag);
 }
 
+////////////////////////////////////////////////////////////////////////
+// sin(x) polynomial kernel, intended for x in [-pi/4, pi/4]
+// (no range reduction is performed here)
+// Minimax polynomial, coefficients via Remez algorithm (Sollya)
+// Max |error| < 7.3e-9
+// sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_sin_poly_avx512(const __m512 x)
+{
+    const __m512 x_sq = _mm512_mul_ps(x, x);
+    const __m512 x_cu = _mm512_mul_ps(x_sq, x);
+
+    // Innermost Horner step first: s2 + x^2 * s3
+    const __m512 inner = _mm512_fmadd_ps(
+        x_sq, _mm512_set1_ps(-0x1.9ab22ap-13f), _mm512_set1_ps(+0x1.110be2p-7f));
+
+    // s1 + x^2 * inner
+    const __m512 outer = _mm512_fmadd_ps(x_sq, inner, _mm512_set1_ps(-0x1.555552p-3f));
+
+    // x + x^3 * outer
+    return _mm512_fmadd_ps(x_cu, outer, x);
+}
+
+////////////////////////////////////////////////////////////////////////
+// cos(x) polynomial kernel, intended for x in [-pi/4, pi/4]
+// (no range reduction is performed here)
+// Minimax polynomial, coefficients via Remez algorithm (Sollya)
+// Max |error| < 1.1e-7
+// cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_cos_poly_avx512(const __m512 x)
+{
+    const __m512 unity = _mm512_set1_ps(1.0f);
+    const __m512 coeff_x2 = _mm512_set1_ps(-0x1.fffff4p-2f);
+    const __m512 coeff_x4 = _mm512_set1_ps(+0x1.554a46p-5f);
+    const __m512 coeff_x6 = _mm512_set1_ps(-0x1.661be2p-10f);
+
+    const __m512 x_sq = _mm512_mul_ps(x, x);
+
+    // Horner's scheme in x^2, written as one nested chain of fused multiply-adds
+    return _mm512_fmadd_ps(
+        x_sq,
+        _mm512_fmadd_ps(x_sq, _mm512_fmadd_ps(x_sq, coeff_x6, coeff_x4), coeff_x2),
+        unity);
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark <[email protected]>
+ * Copyright 2023 - 2025 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -200,6 +200,101 @@ static inline float volk_arctan_poly(const float x)
     return arctan;
 }
 ////////////////////////////////////////////////////////////////////////
+// sin(x) polynomial expansion
+////////////////////////////////////////////////////////////////////////
+static inline float volk_sin_poly(const float x)
+{
+    /*
+     * Minimax polynomial for sin(x) on [-pi/4, pi/4]
+     * Coefficients via Remez algorithm (Sollya)
+     * Max |error| < 7.3e-9
+     * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
+     *
+     * No range reduction is done here; the caller supplies the reduced x.
+     */
+    const float coeff_x3 = -0x1.555552p-3f;
+    const float coeff_x5 = +0x1.110be2p-7f;
+    const float coeff_x7 = -0x1.9ab22ap-13f;
+
+    const float x_sq = x * x;
+    const float x_cu = x_sq * x;
+
+    /* Horner's scheme in x^2, innermost term first */
+    const float inner = fmaf(x_sq, coeff_x7, coeff_x5);
+    const float outer = fmaf(x_sq, inner, coeff_x3);
+
+    /* x + x^3 * outer */
+    return fmaf(x_cu, outer, x);
+}
+////////////////////////////////////////////////////////////////////////
+// cos(x) polynomial expansion
+////////////////////////////////////////////////////////////////////////
+static inline float volk_cos_poly(const float x)
+{
+    /*
+     * Minimax polynomial for cos(x) on [-pi/4, pi/4]
+     * Coefficients via Remez algorithm (Sollya)
+     * Max |error| < 1.1e-7
+     * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
+     *
+     * No range reduction is done here; the caller supplies the reduced x.
+     */
+    const float coeff_x2 = -0x1.fffff4p-2f;
+    const float coeff_x4 = +0x1.554a46p-5f;
+    const float coeff_x6 = -0x1.661be2p-10f;
+
+    const float x_sq = x * x;
+
+    /* Horner's scheme in x^2, innermost term first */
+    const float inner = fmaf(x_sq, coeff_x6, coeff_x4);
+    const float outer = fmaf(x_sq, inner, coeff_x2);
+
+    /* 1 + x^2 * outer */
+    return fmaf(x_sq, outer, 1.0f);
+}
+////////////////////////////////////////////////////////////////////////
+// sin(x) with Cody-Waite argument reduction
+////////////////////////////////////////////////////////////////////////
+static inline float volk_sin(const float x)
+{
+    /*
+     * Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
+     * Then use sin/cos polynomials based on quadrant.
+     *
+     * pi/2 is split into a high part and a small low correction; n * pi/2 is
+     * subtracted from x in two fused multiply-add steps, high part first.
+     */
+    const float two_over_pi = 0x1.45f306p-1f;
+    const float pi_over_2_hi = 0x1.921fb6p+0f;
+    const float pi_over_2_lo = -0x1.777a5cp-25f;
+
+    const float n_f = rintf(x * two_over_pi);
+
+    /*
+     * Converting a float to int is undefined behaviour when the value is NaN,
+     * infinite, or outside the range of int. The negated comparison is also
+     * true for NaN. Such inputs are handed to the C library instead.
+     * The 2^31 bound assumes int is at least 32 bits wide.
+     */
+    if (!(fabsf(n_f) < 0x1p31f)) {
+        return sinf(x);
+    }
+    const int n = (int)n_f;
+
+    float r = fmaf(-n_f, pi_over_2_hi, x);
+    r = fmaf(-n_f, pi_over_2_lo, r);
+
+    const float sin_r = volk_sin_poly(r);
+    const float cos_r = volk_cos_poly(r);
+
+    // Quadrant selection: n&1 swaps sin/cos, n&2 negates
+    const float result = (n & 1) ? cos_r : sin_r;
+    return (n & 2) ? -result : result;
+}
+////////////////////////////////////////////////////////////////////////
+// cos(x) with Cody-Waite argument reduction
+////////////////////////////////////////////////////////////////////////
+static inline float volk_cos(const float x)
+{
+    /*
+     * Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
+     * Then use sin/cos polynomials based on quadrant.
+     *
+     * pi/2 is split into a high part and a small low correction; n * pi/2 is
+     * subtracted from x in two fused multiply-add steps, high part first.
+     */
+    const float two_over_pi = 0x1.45f306p-1f;
+    const float pi_over_2_hi = 0x1.921fb6p+0f;
+    const float pi_over_2_lo = -0x1.777a5cp-25f;
+
+    const float n_f = rintf(x * two_over_pi);
+
+    /*
+     * Converting a float to int is undefined behaviour when the value is NaN,
+     * infinite, or outside the range of int. The negated comparison is also
+     * true for NaN. Such inputs are handed to the C library instead.
+     * The 2^31 bound assumes int is at least 32 bits wide; it also keeps
+     * n + 1 below from overflowing.
+     */
+    if (!(fabsf(n_f) < 0x1p31f)) {
+        return cosf(x);
+    }
+    const int n = (int)n_f;
+
+    float r = fmaf(-n_f, pi_over_2_hi, x);
+    r = fmaf(-n_f, pi_over_2_lo, r);
+
+    const float sin_r = volk_sin_poly(r);
+    const float cos_r = volk_cos_poly(r);
+
+    // Quadrant selection: n&1 swaps sin/cos, (n+1)&2 negates
+    const float result = (n & 1) ? sin_r : cos_r;
+    return ((n + 1) & 2) ? -result : result;
+}
+////////////////////////////////////////////////////////////////////////
 // arctan(x)
 ////////////////////////////////////////////////////////////////////////
 static inline float volk_arctan(const float x)
 
@@ -306,18 +306,6 @@ static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
     return sincos;
 }
 
-static inline float32x4_t _vsinq_f32(float32x4_t x)
-{
-    const float32x4x2_t sincos = _vsincosq_f32(x);
-    return sincos.val[0];
-}
-
-static inline float32x4_t _vcosq_f32(float32x4_t x)
-{
-    const float32x4x2_t sincos = _vsincosq_f32(x);
-    return sincos.val[1];
-}
-
 static inline float32x4_t _vtanq_f32(float32x4_t x)
 {
     const float32x4x2_t sincos = _vsincosq_f32(x);
@@ -372,6 +360,46 @@ static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
 #endif
 }
 
+/*
+ * sin(x) polynomial kernel (NEON), intended for x in [-pi/4, pi/4].
+ * No range reduction is performed here.
+ *
+ * Minimax polynomial, coefficients via Remez algorithm (Sollya)
+ * Max |error| < 7.3e-9
+ * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
+ */
+static inline float32x4_t _vsin_poly_f32(float32x4_t x)
+{
+    const float32x4_t coeff_x3 = vdupq_n_f32(-0x1.555552p-3f);
+    const float32x4_t coeff_x5 = vdupq_n_f32(+0x1.110be2p-7f);
+    const float32x4_t coeff_x7 = vdupq_n_f32(-0x1.9ab22ap-13f);
+
+    const float32x4_t x_sq = vmulq_f32(x, x);
+    const float32x4_t x_cu = vmulq_f32(x_sq, x);
+
+    /* Horner's scheme in x^2; the accumulator is the first vmlaq_f32 operand */
+    const float32x4_t inner = vmlaq_f32(coeff_x5, x_sq, coeff_x7);
+    const float32x4_t outer = vmlaq_f32(coeff_x3, x_sq, inner);
+
+    /* x + x^3 * outer */
+    return vmlaq_f32(x, x_cu, outer);
+}
+
+/*
+ * cos(x) polynomial kernel (NEON), intended for x in [-pi/4, pi/4].
+ * No range reduction is performed here.
+ *
+ * Minimax polynomial, coefficients via Remez algorithm (Sollya)
+ * Max |error| < 1.1e-7
+ * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
+ */
+static inline float32x4_t _vcos_poly_f32(float32x4_t x)
+{
+    const float32x4_t unity = vdupq_n_f32(1.0f);
+    const float32x4_t coeff_x2 = vdupq_n_f32(-0x1.fffff4p-2f);
+    const float32x4_t coeff_x4 = vdupq_n_f32(+0x1.554a46p-5f);
+    const float32x4_t coeff_x6 = vdupq_n_f32(-0x1.661be2p-10f);
+
+    const float32x4_t x_sq = vmulq_f32(x, x);
+
+    /* Horner's scheme in x^2, written as one nested chain of multiply-accumulates */
+    return vmlaq_f32(
+        unity, x_sq, vmlaq_f32(coeff_x2, x_sq, vmlaq_f32(coeff_x4, x_sq, coeff_x6)));
+}
+
 #ifdef LV_HAVE_NEONV8
 /* ARMv8 NEON FMA-based arctan polynomial for better accuracy and throughput */
 static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
@@ -396,6 +424,36 @@ static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
 
     return result;
 }
+
+/*
+ * NEONv8 FMA sin polynomial on [-pi/4, pi/4], coeffs via Remez (Sollya)
+ * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
+ * No range reduction is performed here.
+ */
+static inline float32x4_t _vsin_poly_neonv8(float32x4_t x)
+{
+    const float32x4_t x_sq = vmulq_f32(x, x);
+    const float32x4_t x_cu = vmulq_f32(x_sq, x);
+
+    /* Innermost Horner step first: s2 + x^2 * s3 */
+    const float32x4_t inner =
+        vfmaq_f32(vdupq_n_f32(+0x1.110be2p-7f), x_sq, vdupq_n_f32(-0x1.9ab22ap-13f));
+
+    /* s1 + x^2 * inner */
+    const float32x4_t outer = vfmaq_f32(vdupq_n_f32(-0x1.555552p-3f), x_sq, inner);
+
+    /* x + x^3 * outer */
+    return vfmaq_f32(x, x_cu, outer);
+}
+
+/*
+ * NEONv8 FMA cos polynomial on [-pi/4, pi/4], coeffs via Remez (Sollya)
+ * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
+ * No range reduction is performed here.
+ */
+static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
+{
+    const float32x4_t unity = vdupq_n_f32(1.0f);
+    const float32x4_t coeff_x2 = vdupq_n_f32(-0x1.fffff4p-2f);
+    const float32x4_t coeff_x4 = vdupq_n_f32(+0x1.554a46p-5f);
+    const float32x4_t coeff_x6 = vdupq_n_f32(-0x1.661be2p-10f);
+
+    const float32x4_t x_sq = vmulq_f32(x, x);
+
+    /* Horner's scheme in x^2, written as one nested chain of fused multiply-adds */
+    return vfmaq_f32(
+        unity, x_sq, vfmaq_f32(coeff_x2, x_sq, vfmaq_f32(coeff_x4, x_sq, coeff_x6)));
+}
 #endif /* LV_HAVE_NEONV8 */
 
 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */