Skip to content

Commit 837e4b2

Browse files
authored
Merge pull request #806 from Ka-zam/sincos_poly_pr
Polynomial-based sin/cos/sincos kernels
2 parents 34543dc + 1ba3857 commit 837e4b2

File tree

10 files changed

+2119
-1507
lines changed

10 files changed

+2119
-1507
lines changed

include/volk/volk_avx2_fma_intrinsics.h

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- c++ -*- */
22
/*
3-
* Copyright 2023 Magnus Lundmark <[email protected]>
3+
* Copyright 2023 - 2025 Magnus Lundmark <[email protected]>
44
*
55
* This file is part of VOLK
66
*
@@ -70,4 +70,44 @@ static inline __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x)
7070
return _mm256_mul_ps(x, p);
7171
}
7272

73+
/*
74+
* Minimax polynomial for sin(x) on [-pi/4, pi/4]
75+
* Coefficients via Remez algorithm (Sollya)
76+
* Max |error| < 7.3e-9
77+
* sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
78+
*/
79+
static inline __m256 _mm256_sin_poly_avx2_fma(const __m256 x)
80+
{
81+
const __m256 s1 = _mm256_set1_ps(-0x1.555552p-3f);
82+
const __m256 s2 = _mm256_set1_ps(+0x1.110be2p-7f);
83+
const __m256 s3 = _mm256_set1_ps(-0x1.9ab22ap-13f);
84+
85+
const __m256 x2 = _mm256_mul_ps(x, x);
86+
const __m256 x3 = _mm256_mul_ps(x2, x);
87+
88+
__m256 poly = _mm256_fmadd_ps(x2, s3, s2);
89+
poly = _mm256_fmadd_ps(x2, poly, s1);
90+
return _mm256_fmadd_ps(x3, poly, x);
91+
}
92+
93+
/*
94+
* Minimax polynomial for cos(x) on [-pi/4, pi/4]
95+
* Coefficients via Remez algorithm (Sollya)
96+
* Max |error| < 1.1e-7
97+
* cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
98+
*/
99+
static inline __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
100+
{
101+
const __m256 c1 = _mm256_set1_ps(-0x1.fffff4p-2f);
102+
const __m256 c2 = _mm256_set1_ps(+0x1.554a46p-5f);
103+
const __m256 c3 = _mm256_set1_ps(-0x1.661be2p-10f);
104+
const __m256 one = _mm256_set1_ps(1.0f);
105+
106+
const __m256 x2 = _mm256_mul_ps(x, x);
107+
108+
__m256 poly = _mm256_fmadd_ps(x2, c3, c2);
109+
poly = _mm256_fmadd_ps(x2, poly, c1);
110+
return _mm256_fmadd_ps(x2, poly, one);
111+
}
112+
73113
#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */

include/volk/volk_avx2_intrinsics.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,4 +345,46 @@ static inline void vector_32fc_index_min_variant1(__m256 in0,
345345
*current_indices = _mm256_add_epi32(*current_indices, indices_increment);
346346
}
347347

348+
/*
349+
* Approximate sin(x) via polynomial expansion
350+
* on the interval [-pi/4, pi/4]
351+
*
352+
* Maximum absolute error ~7.3e-9
353+
* sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
354+
*/
355+
static inline __m256 _mm256_sin_poly_avx2(const __m256 x)
356+
{
357+
const __m256 s1 = _mm256_set1_ps(-0x1.555552p-3f);
358+
const __m256 s2 = _mm256_set1_ps(+0x1.110be2p-7f);
359+
const __m256 s3 = _mm256_set1_ps(-0x1.9ab22ap-13f);
360+
361+
const __m256 x2 = _mm256_mul_ps(x, x);
362+
const __m256 x3 = _mm256_mul_ps(x2, x);
363+
364+
__m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, s3), s2);
365+
poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), s1);
366+
return _mm256_add_ps(_mm256_mul_ps(x3, poly), x);
367+
}
368+
369+
/*
370+
* Approximate cos(x) via polynomial expansion
371+
* on the interval [-pi/4, pi/4]
372+
*
373+
* Maximum absolute error ~1.1e-7
374+
* cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
375+
*/
376+
static inline __m256 _mm256_cos_poly_avx2(const __m256 x)
377+
{
378+
const __m256 c1 = _mm256_set1_ps(-0x1.fffff4p-2f);
379+
const __m256 c2 = _mm256_set1_ps(+0x1.554a46p-5f);
380+
const __m256 c3 = _mm256_set1_ps(-0x1.661be2p-10f);
381+
const __m256 one = _mm256_set1_ps(1.0f);
382+
383+
const __m256 x2 = _mm256_mul_ps(x, x);
384+
385+
__m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, c3), c2);
386+
poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c1);
387+
return _mm256_add_ps(_mm256_mul_ps(x2, poly), one);
388+
}
389+
348390
#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */

include/volk/volk_avx512_intrinsics.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,4 +170,46 @@ static inline __m512 _mm512_normalize_ps(const __m512 val)
170170
return _mm512_div_ps(val, mag);
171171
}
172172

173+
////////////////////////////////////////////////////////////////////////
174+
// Minimax polynomial for sin(x) on [-pi/4, pi/4]
175+
// Coefficients via Remez algorithm (Sollya)
176+
// Max |error| < 7.3e-9
177+
// sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
178+
// Requires AVX512F
179+
////////////////////////////////////////////////////////////////////////
180+
static inline __m512 _mm512_sin_poly_avx512(const __m512 x)
181+
{
182+
const __m512 s1 = _mm512_set1_ps(-0x1.555552p-3f);
183+
const __m512 s2 = _mm512_set1_ps(+0x1.110be2p-7f);
184+
const __m512 s3 = _mm512_set1_ps(-0x1.9ab22ap-13f);
185+
186+
const __m512 x2 = _mm512_mul_ps(x, x);
187+
const __m512 x3 = _mm512_mul_ps(x2, x);
188+
189+
__m512 poly = _mm512_fmadd_ps(x2, s3, s2);
190+
poly = _mm512_fmadd_ps(x2, poly, s1);
191+
return _mm512_fmadd_ps(x3, poly, x);
192+
}
193+
194+
////////////////////////////////////////////////////////////////////////
195+
// Minimax polynomial for cos(x) on [-pi/4, pi/4]
196+
// Coefficients via Remez algorithm (Sollya)
197+
// Max |error| < 1.1e-7
198+
// cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
199+
// Requires AVX512F
200+
////////////////////////////////////////////////////////////////////////
201+
static inline __m512 _mm512_cos_poly_avx512(const __m512 x)
202+
{
203+
const __m512 c1 = _mm512_set1_ps(-0x1.fffff4p-2f);
204+
const __m512 c2 = _mm512_set1_ps(+0x1.554a46p-5f);
205+
const __m512 c3 = _mm512_set1_ps(-0x1.661be2p-10f);
206+
const __m512 one = _mm512_set1_ps(1.0f);
207+
208+
const __m512 x2 = _mm512_mul_ps(x, x);
209+
210+
__m512 poly = _mm512_fmadd_ps(x2, c3, c2);
211+
poly = _mm512_fmadd_ps(x2, poly, c1);
212+
return _mm512_fmadd_ps(x2, poly, one);
213+
}
214+
173215
#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */

include/volk/volk_common.h

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* -*- c++ -*- */
22
/*
33
* Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
4-
* Copyright 2023 Magnus Lundmark <[email protected]>
4+
* Copyright 2023 - 2025 Magnus Lundmark <[email protected]>
55
*
66
* This file is part of VOLK
77
*
@@ -200,6 +200,101 @@ static inline float volk_arctan_poly(const float x)
200200
return arctan;
201201
}
202202
////////////////////////////////////////////////////////////////////////
203+
// sin(x) polynomial expansion
204+
////////////////////////////////////////////////////////////////////////
205+
static inline float volk_sin_poly(const float x)
206+
{
207+
/*
208+
* Minimax polynomial for sin(x) on [-pi/4, pi/4]
209+
* Coefficients via Remez algorithm (Sollya)
210+
* Max |error| < 7.3e-9
211+
* sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
212+
*/
213+
const float s1 = -0x1.555552p-3f;
214+
const float s2 = +0x1.110be2p-7f;
215+
const float s3 = -0x1.9ab22ap-13f;
216+
217+
const float x2 = x * x;
218+
const float x3 = x2 * x;
219+
220+
float poly = fmaf(x2, s3, s2);
221+
poly = fmaf(x2, poly, s1);
222+
return fmaf(x3, poly, x);
223+
}
224+
////////////////////////////////////////////////////////////////////////
225+
// cos(x) polynomial expansion
226+
////////////////////////////////////////////////////////////////////////
227+
static inline float volk_cos_poly(const float x)
228+
{
229+
/*
230+
* Minimax polynomial for cos(x) on [-pi/4, pi/4]
231+
* Coefficients via Remez algorithm (Sollya)
232+
* Max |error| < 1.1e-7
233+
* cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
234+
*/
235+
const float c1 = -0x1.fffff4p-2f;
236+
const float c2 = +0x1.554a46p-5f;
237+
const float c3 = -0x1.661be2p-10f;
238+
239+
const float x2 = x * x;
240+
241+
float poly = fmaf(x2, c3, c2);
242+
poly = fmaf(x2, poly, c1);
243+
return fmaf(x2, poly, 1.0f);
244+
}
245+
////////////////////////////////////////////////////////////////////////
246+
// sin(x) with Cody-Waite argument reduction
247+
////////////////////////////////////////////////////////////////////////
248+
static inline float volk_sin(const float x)
249+
{
250+
/*
251+
* Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
252+
* Then use sin/cos polynomials based on quadrant
253+
*/
254+
const float two_over_pi = 0x1.45f306p-1f;
255+
const float pi_over_2_hi = 0x1.921fb6p+0f;
256+
const float pi_over_2_lo = -0x1.777a5cp-25f;
257+
258+
float n_f = rintf(x * two_over_pi);
259+
int n = (int)n_f;
260+
261+
float r = fmaf(-n_f, pi_over_2_hi, x);
262+
r = fmaf(-n_f, pi_over_2_lo, r);
263+
264+
float sin_r = volk_sin_poly(r);
265+
float cos_r = volk_cos_poly(r);
266+
267+
// Quadrant selection: n&1 swaps sin/cos, n&2 negates
268+
float result = (n & 1) ? cos_r : sin_r;
269+
return (n & 2) ? -result : result;
270+
}
271+
////////////////////////////////////////////////////////////////////////
272+
// cos(x) with Cody-Waite argument reduction
273+
////////////////////////////////////////////////////////////////////////
274+
static inline float volk_cos(const float x)
275+
{
276+
/*
277+
* Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
278+
* Then use sin/cos polynomials based on quadrant
279+
*/
280+
const float two_over_pi = 0x1.45f306p-1f;
281+
const float pi_over_2_hi = 0x1.921fb6p+0f;
282+
const float pi_over_2_lo = -0x1.777a5cp-25f;
283+
284+
float n_f = rintf(x * two_over_pi);
285+
int n = (int)n_f;
286+
287+
float r = fmaf(-n_f, pi_over_2_hi, x);
288+
r = fmaf(-n_f, pi_over_2_lo, r);
289+
290+
float sin_r = volk_sin_poly(r);
291+
float cos_r = volk_cos_poly(r);
292+
293+
// Quadrant selection: n&1 swaps sin/cos, (n+1)&2 negates
294+
float result = (n & 1) ? sin_r : cos_r;
295+
return ((n + 1) & 2) ? -result : result;
296+
}
297+
////////////////////////////////////////////////////////////////////////
203298
// arctan(x)
204299
////////////////////////////////////////////////////////////////////////
205300
static inline float volk_arctan(const float x)

include/volk/volk_neon_intrinsics.h

Lines changed: 70 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -306,18 +306,6 @@ static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
306306
return sincos;
307307
}
308308

309-
static inline float32x4_t _vsinq_f32(float32x4_t x)
310-
{
311-
const float32x4x2_t sincos = _vsincosq_f32(x);
312-
return sincos.val[0];
313-
}
314-
315-
static inline float32x4_t _vcosq_f32(float32x4_t x)
316-
{
317-
const float32x4x2_t sincos = _vsincosq_f32(x);
318-
return sincos.val[1];
319-
}
320-
321309
static inline float32x4_t _vtanq_f32(float32x4_t x)
322310
{
323311
const float32x4x2_t sincos = _vsincosq_f32(x);
@@ -372,6 +360,46 @@ static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
372360
#endif
373361
}
374362

363+
/*
364+
* Minimax polynomial for sin(x) on [-pi/4, pi/4]
365+
* Coefficients via Remez algorithm (Sollya)
366+
* Max |error| < 7.3e-9
367+
* sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
368+
*/
369+
static inline float32x4_t _vsin_poly_f32(float32x4_t x)
370+
{
371+
const float32x4_t s1 = vdupq_n_f32(-0x1.555552p-3f);
372+
const float32x4_t s2 = vdupq_n_f32(+0x1.110be2p-7f);
373+
const float32x4_t s3 = vdupq_n_f32(-0x1.9ab22ap-13f);
374+
375+
const float32x4_t x2 = vmulq_f32(x, x);
376+
const float32x4_t x3 = vmulq_f32(x2, x);
377+
378+
float32x4_t poly = vmlaq_f32(s2, x2, s3);
379+
poly = vmlaq_f32(s1, x2, poly);
380+
return vmlaq_f32(x, x3, poly);
381+
}
382+
383+
/*
384+
* Minimax polynomial for cos(x) on [-pi/4, pi/4]
385+
* Coefficients via Remez algorithm (Sollya)
386+
* Max |error| < 1.1e-7
387+
* cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
388+
*/
389+
static inline float32x4_t _vcos_poly_f32(float32x4_t x)
390+
{
391+
const float32x4_t c1 = vdupq_n_f32(-0x1.fffff4p-2f);
392+
const float32x4_t c2 = vdupq_n_f32(+0x1.554a46p-5f);
393+
const float32x4_t c3 = vdupq_n_f32(-0x1.661be2p-10f);
394+
const float32x4_t one = vdupq_n_f32(1.0f);
395+
396+
const float32x4_t x2 = vmulq_f32(x, x);
397+
398+
float32x4_t poly = vmlaq_f32(c2, x2, c3);
399+
poly = vmlaq_f32(c1, x2, poly);
400+
return vmlaq_f32(one, x2, poly);
401+
}
402+
375403
#ifdef LV_HAVE_NEONV8
376404
/* ARMv8 NEON FMA-based arctan polynomial for better accuracy and throughput */
377405
static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
@@ -396,6 +424,36 @@ static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
396424

397425
return result;
398426
}
427+
428+
/* NEONv8 FMA sin polynomial on [-pi/4, pi/4], coeffs via Remez (Sollya) */
429+
static inline float32x4_t _vsin_poly_neonv8(float32x4_t x)
430+
{
431+
const float32x4_t s1 = vdupq_n_f32(-0x1.555552p-3f);
432+
const float32x4_t s2 = vdupq_n_f32(+0x1.110be2p-7f);
433+
const float32x4_t s3 = vdupq_n_f32(-0x1.9ab22ap-13f);
434+
435+
const float32x4_t x2 = vmulq_f32(x, x);
436+
const float32x4_t x3 = vmulq_f32(x2, x);
437+
438+
float32x4_t poly = vfmaq_f32(s2, x2, s3);
439+
poly = vfmaq_f32(s1, x2, poly);
440+
return vfmaq_f32(x, x3, poly);
441+
}
442+
443+
/* NEONv8 FMA cos polynomial on [-pi/4, pi/4], coeffs via Remez (Sollya) */
444+
static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
445+
{
446+
const float32x4_t c1 = vdupq_n_f32(-0x1.fffff4p-2f);
447+
const float32x4_t c2 = vdupq_n_f32(+0x1.554a46p-5f);
448+
const float32x4_t c3 = vdupq_n_f32(-0x1.661be2p-10f);
449+
const float32x4_t one = vdupq_n_f32(1.0f);
450+
451+
const float32x4_t x2 = vmulq_f32(x, x);
452+
453+
float32x4_t poly = vfmaq_f32(c2, x2, c3);
454+
poly = vfmaq_f32(c1, x2, poly);
455+
return vfmaq_f32(one, x2, poly);
456+
}
399457
#endif /* LV_HAVE_NEONV8 */
400458

401459
#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */

0 commit comments

Comments
 (0)