Skip to content

Commit 3cf2f53

Browse files
authored
Merge pull request #812 from Ka-zam/log2_edge_cases
Log2 edge cases
2 parents 72e5450 + 0a56038 commit 3cf2f53

10 files changed

+696
-889
lines changed

include/volk/volk_avx2_fma_intrinsics.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,33 @@ static inline __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
110110
return _mm256_fmadd_ps(x2, poly, one);
111111
}
112112

113+
/*
114+
* Polynomial coefficients for log2(x)/(x-1) on [1, 2]
115+
* Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
116+
* Max error: ~1.55e-6
117+
*
118+
* Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
119+
* Polynomial evaluated via Horner's method with FMA
120+
*/
121+
static inline __m256 _mm256_log2_poly_avx2_fma(const __m256 x)
122+
{
123+
const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
124+
const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
125+
const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
126+
const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
127+
const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
128+
const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
129+
const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);
130+
131+
// Horner's method with FMA: c0 + x*(c1 + x*(c2 + ...))
132+
__m256 poly = c6;
133+
poly = _mm256_fmadd_ps(poly, x, c5);
134+
poly = _mm256_fmadd_ps(poly, x, c4);
135+
poly = _mm256_fmadd_ps(poly, x, c3);
136+
poly = _mm256_fmadd_ps(poly, x, c2);
137+
poly = _mm256_fmadd_ps(poly, x, c1);
138+
poly = _mm256_fmadd_ps(poly, x, c0);
139+
return poly;
140+
}
141+
113142
#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */

include/volk/volk_avx2_intrinsics.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,4 +414,33 @@ static inline __m256 _mm256_cos_poly_avx2(const __m256 x)
414414
return _mm256_add_ps(_mm256_mul_ps(x2, poly), one);
415415
}
416416

417+
/*
418+
* Polynomial coefficients for log2(x)/(x-1) on [1, 2]
419+
* Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
420+
* Max error: ~1.55e-6
421+
*
422+
* Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
423+
* Polynomial evaluated via Horner's method
424+
*/
425+
static inline __m256 _mm256_log2_poly_avx2(const __m256 x)
426+
{
427+
const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
428+
const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
429+
const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
430+
const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
431+
const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
432+
const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
433+
const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);
434+
435+
// Horner's method: c0 + x*(c1 + x*(c2 + ...))
436+
__m256 poly = c6;
437+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c5);
438+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c4);
439+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c3);
440+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c2);
441+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c1);
442+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c0);
443+
return poly;
444+
}
445+
417446
#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */

include/volk/volk_avx512_intrinsics.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,4 +238,34 @@ static inline __m512 _mm512_cos_poly_avx512(const __m512 x)
238238
return _mm512_fmadd_ps(x2, poly, one);
239239
}
240240

241+
////////////////////////////////////////////////////////////////////////
242+
// Polynomial coefficients for log2(x)/(x-1) on [1, 2]
243+
// Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
244+
// Max error: ~1.55e-6
245+
//
246+
// Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
247+
// Polynomial evaluated via Horner's method with FMA
248+
// Requires AVX512F
249+
////////////////////////////////////////////////////////////////////////
250+
static inline __m512 _mm512_log2_poly_avx512(const __m512 x)
251+
{
252+
const __m512 c0 = _mm512_set1_ps(+0x1.a8a726p+1f);
253+
const __m512 c1 = _mm512_set1_ps(-0x1.0b7f7ep+2f);
254+
const __m512 c2 = _mm512_set1_ps(+0x1.05d9ccp+2f);
255+
const __m512 c3 = _mm512_set1_ps(-0x1.4d476cp+1f);
256+
const __m512 c4 = _mm512_set1_ps(+0x1.04fc3ap+0f);
257+
const __m512 c5 = _mm512_set1_ps(-0x1.c97982p-3f);
258+
const __m512 c6 = _mm512_set1_ps(+0x1.57aa42p-6f);
259+
260+
// Horner's method with FMA: c0 + x*(c1 + x*(c2 + ...))
261+
__m512 poly = c6;
262+
poly = _mm512_fmadd_ps(poly, x, c5);
263+
poly = _mm512_fmadd_ps(poly, x, c4);
264+
poly = _mm512_fmadd_ps(poly, x, c3);
265+
poly = _mm512_fmadd_ps(poly, x, c2);
266+
poly = _mm512_fmadd_ps(poly, x, c1);
267+
poly = _mm512_fmadd_ps(poly, x, c0);
268+
return poly;
269+
}
270+
241271
#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */

include/volk/volk_avx_intrinsics.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,4 +293,33 @@ static inline __m256 _mm256_accumulate_square_sum_ps(
293293
return _mm256_add_ps(sq_acc, aux);
294294
}
295295

296+
/*
297+
* Polynomial coefficients for log2(x)/(x-1) on [1, 2]
298+
* Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
299+
* Max error: ~1.55e-6
300+
*
301+
* Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
302+
* Polynomial evaluated via Horner's method
303+
*/
304+
static inline __m256 _mm256_log2_poly_avx(const __m256 x)
305+
{
306+
const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
307+
const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
308+
const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
309+
const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
310+
const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
311+
const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
312+
const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);
313+
314+
// Horner's method: c0 + x*(c1 + x*(c2 + ...))
315+
__m256 poly = c6;
316+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c5);
317+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c4);
318+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c3);
319+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c2);
320+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c1);
321+
poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c0);
322+
return poly;
323+
}
324+
296325
#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */

include/volk/volk_neon_intrinsics.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,35 @@ static inline float32x4_t _vcos_poly_f32(float32x4_t x)
407407
return vmlaq_f32(one, x2, poly);
408408
}
409409

410+
/*
411+
* Polynomial coefficients for log2(x)/(x-1) on [1, 2]
412+
* Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
413+
* Max error: ~1.55e-6
414+
*
415+
* Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
416+
* Polynomial evaluated via Horner's method
417+
*/
418+
static inline float32x4_t _vlog2_poly_f32(float32x4_t x)
419+
{
420+
const float32x4_t c0 = vdupq_n_f32(+0x1.a8a726p+1f);
421+
const float32x4_t c1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
422+
const float32x4_t c2 = vdupq_n_f32(+0x1.05d9ccp+2f);
423+
const float32x4_t c3 = vdupq_n_f32(-0x1.4d476cp+1f);
424+
const float32x4_t c4 = vdupq_n_f32(+0x1.04fc3ap+0f);
425+
const float32x4_t c5 = vdupq_n_f32(-0x1.c97982p-3f);
426+
const float32x4_t c6 = vdupq_n_f32(+0x1.57aa42p-6f);
427+
428+
// Horner's method: c0 + x*(c1 + x*(c2 + ...))
429+
float32x4_t poly = c6;
430+
poly = vmlaq_f32(c5, poly, x);
431+
poly = vmlaq_f32(c4, poly, x);
432+
poly = vmlaq_f32(c3, poly, x);
433+
poly = vmlaq_f32(c2, poly, x);
434+
poly = vmlaq_f32(c1, poly, x);
435+
poly = vmlaq_f32(c0, poly, x);
436+
return poly;
437+
}
438+
410439
#ifdef LV_HAVE_NEONV8
411440
/* ARMv8 NEON FMA-based arctan polynomial for better accuracy and throughput */
412441
static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
@@ -461,6 +490,32 @@ static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
461490
poly = vfmaq_f32(c1, x2, poly);
462491
return vfmaq_f32(one, x2, poly);
463492
}
493+
494+
/*
495+
* NEONv8 FMA log2 polynomial on [1, 2]
496+
* log2(x) ≈ poly(x) * (x - 1)
497+
* Max error: ~1.55e-6
498+
*/
499+
static inline float32x4_t _vlog2_poly_neonv8(float32x4_t x)
500+
{
501+
const float32x4_t c0 = vdupq_n_f32(+0x1.a8a726p+1f);
502+
const float32x4_t c1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
503+
const float32x4_t c2 = vdupq_n_f32(+0x1.05d9ccp+2f);
504+
const float32x4_t c3 = vdupq_n_f32(-0x1.4d476cp+1f);
505+
const float32x4_t c4 = vdupq_n_f32(+0x1.04fc3ap+0f);
506+
const float32x4_t c5 = vdupq_n_f32(-0x1.c97982p-3f);
507+
const float32x4_t c6 = vdupq_n_f32(+0x1.57aa42p-6f);
508+
509+
// Horner's method with FMA: c0 + x*(c1 + x*(c2 + ...))
510+
float32x4_t poly = c6;
511+
poly = vfmaq_f32(c5, poly, x);
512+
poly = vfmaq_f32(c4, poly, x);
513+
poly = vfmaq_f32(c3, poly, x);
514+
poly = vfmaq_f32(c2, poly, x);
515+
poly = vfmaq_f32(c1, poly, x);
516+
poly = vfmaq_f32(c0, poly, x);
517+
return poly;
518+
}
464519
#endif /* LV_HAVE_NEONV8 */
465520

466521
#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */

include/volk/volk_rvv_intrinsics.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,39 @@
7474

7575
#define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)
7676

77+
/*
78+
* Polynomial coefficients for log2(x)/(x-1) on [1, 2]
79+
* Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
80+
* Max error: ~1.55e-6
81+
*
82+
* Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
83+
* Polynomial evaluated via Horner's method with FMA
84+
*
85+
* Parameters:
86+
* x: mantissa values in [1, 2)
87+
* vl: vector length for operations
88+
* vlmax: maximum vector length used for creating coefficient vectors
89+
*/
90+
static inline vfloat32m2_t
91+
__riscv_vlog2_poly_f32m2(vfloat32m2_t x, size_t vl, size_t vlmax)
92+
{
93+
const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(+0x1.a8a726p+1f, vlmax);
94+
const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0x1.0b7f7ep+2f, vlmax);
95+
const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(+0x1.05d9ccp+2f, vlmax);
96+
const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.4d476cp+1f, vlmax);
97+
const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(+0x1.04fc3ap+0f, vlmax);
98+
const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(-0x1.c97982p-3f, vlmax);
99+
const vfloat32m2_t c6 = __riscv_vfmv_v_f_f32m2(+0x1.57aa42p-6f, vlmax);
100+
101+
// Horner's method with FMA: c0 + x*(c1 + x*(c2 + ...))
102+
vfloat32m2_t poly = c6;
103+
poly = __riscv_vfmadd(poly, x, c5, vl);
104+
poly = __riscv_vfmadd(poly, x, c4, vl);
105+
poly = __riscv_vfmadd(poly, x, c3, vl);
106+
poly = __riscv_vfmadd(poly, x, c2, vl);
107+
poly = __riscv_vfmadd(poly, x, c1, vl);
108+
poly = __riscv_vfmadd(poly, x, c0, vl);
109+
return poly;
110+
}
111+
77112
#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */

include/volk/volk_sse_intrinsics.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,33 @@ static inline __m128 _mm_cos_poly_sse(const __m128 x)
188188
return _mm_add_ps(_mm_mul_ps(x2, poly), one);
189189
}
190190

191+
/*
192+
* Polynomial coefficients for log2(x)/(x-1) on [1, 2]
193+
* Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
194+
* Max error: ~1.55e-6
195+
*
196+
* Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
197+
* Polynomial evaluated via Horner's method
198+
*/
199+
static inline __m128 _mm_log2_poly_sse(const __m128 x)
200+
{
201+
const __m128 c0 = _mm_set1_ps(+0x1.a8a726p+1f);
202+
const __m128 c1 = _mm_set1_ps(-0x1.0b7f7ep+2f);
203+
const __m128 c2 = _mm_set1_ps(+0x1.05d9ccp+2f);
204+
const __m128 c3 = _mm_set1_ps(-0x1.4d476cp+1f);
205+
const __m128 c4 = _mm_set1_ps(+0x1.04fc3ap+0f);
206+
const __m128 c5 = _mm_set1_ps(-0x1.c97982p-3f);
207+
const __m128 c6 = _mm_set1_ps(+0x1.57aa42p-6f);
208+
209+
// Horner's method: c0 + x*(c1 + x*(c2 + ...))
210+
__m128 poly = c6;
211+
poly = _mm_add_ps(_mm_mul_ps(poly, x), c5);
212+
poly = _mm_add_ps(_mm_mul_ps(poly, x), c4);
213+
poly = _mm_add_ps(_mm_mul_ps(poly, x), c3);
214+
poly = _mm_add_ps(_mm_mul_ps(poly, x), c2);
215+
poly = _mm_add_ps(_mm_mul_ps(poly, x), c1);
216+
poly = _mm_add_ps(_mm_mul_ps(poly, x), c0);
217+
return poly;
218+
}
219+
191220
#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */

0 commit comments

Comments
 (0)