Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,33 @@ static inline __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
return _mm256_fmadd_ps(x2, poly, one);
}

/*
 * Degree-6 minimax approximation of log2(x) / (x - 1) for x in [1, 2].
 * Coefficients were produced with Sollya:
 *   remez(log2(x)/(x-1), 6, [1+1b-20, 2])
 * Documented maximum error: ~1.55e-6.
 *
 * The caller recovers the logarithm as log2(x) ~= poly(x) * (x - 1).
 * Each Horner step is a single fused multiply-add.
 */
static inline __m256 _mm256_log2_poly_avx2_fma(const __m256 x)
{
    /* Coefficients, listed from the x^6 term down to the constant term. */
    const __m256 k6 = _mm256_set1_ps(+0x1.57aa42p-6f);
    const __m256 k5 = _mm256_set1_ps(-0x1.c97982p-3f);
    const __m256 k4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
    const __m256 k3 = _mm256_set1_ps(-0x1.4d476cp+1f);
    const __m256 k2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
    const __m256 k1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
    const __m256 k0 = _mm256_set1_ps(+0x1.a8a726p+1f);

    /* Horner scheme: (((((k6*x + k5)*x + k4)*x + k3)*x + k2)*x + k1)*x + k0 */
    __m256 acc = _mm256_fmadd_ps(k6, x, k5);
    acc = _mm256_fmadd_ps(acc, x, k4);
    acc = _mm256_fmadd_ps(acc, x, k3);
    acc = _mm256_fmadd_ps(acc, x, k2);
    acc = _mm256_fmadd_ps(acc, x, k1);
    return _mm256_fmadd_ps(acc, x, k0);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
29 changes: 29 additions & 0 deletions include/volk/volk_avx2_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -387,4 +387,33 @@ static inline __m256 _mm256_cos_poly_avx2(const __m256 x)
return _mm256_add_ps(_mm256_mul_ps(x2, poly), one);
}

/*
 * Degree-6 minimax approximation of log2(x) / (x - 1) for x in [1, 2].
 * Coefficients were produced with Sollya:
 *   remez(log2(x)/(x-1), 6, [1+1b-20, 2])
 * Documented maximum error: ~1.55e-6.
 *
 * The caller recovers the logarithm as log2(x) ~= poly(x) * (x - 1).
 * Each Horner step is a separate multiply followed by an add.
 */
static inline __m256 _mm256_log2_poly_avx2(const __m256 x)
{
    /* Coefficients, listed from the x^6 term down to the constant term. */
    const __m256 k6 = _mm256_set1_ps(+0x1.57aa42p-6f);
    const __m256 k5 = _mm256_set1_ps(-0x1.c97982p-3f);
    const __m256 k4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
    const __m256 k3 = _mm256_set1_ps(-0x1.4d476cp+1f);
    const __m256 k2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
    const __m256 k1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
    const __m256 k0 = _mm256_set1_ps(+0x1.a8a726p+1f);

    /* Horner scheme: (((((k6*x + k5)*x + k4)*x + k3)*x + k2)*x + k1)*x + k0 */
    __m256 acc = _mm256_add_ps(_mm256_mul_ps(k6, x), k5);
    acc = _mm256_add_ps(_mm256_mul_ps(acc, x), k4);
    acc = _mm256_add_ps(_mm256_mul_ps(acc, x), k3);
    acc = _mm256_add_ps(_mm256_mul_ps(acc, x), k2);
    acc = _mm256_add_ps(_mm256_mul_ps(acc, x), k1);
    return _mm256_add_ps(_mm256_mul_ps(acc, x), k0);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
30 changes: 30 additions & 0 deletions include/volk/volk_avx512_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,4 +212,34 @@ static inline __m512 _mm512_cos_poly_avx512(const __m512 x)
return _mm512_fmadd_ps(x2, poly, one);
}

////////////////////////////////////////////////////////////////////////
// Degree-6 minimax approximation of log2(x) / (x - 1) for x in [1, 2].
// Coefficients were produced with Sollya:
//   remez(log2(x)/(x-1), 6, [1+1b-20, 2])
// Documented maximum error: ~1.55e-6
//
// The caller recovers the logarithm as log2(x) ~= poly(x) * (x - 1).
// Each Horner step is a single fused multiply-add.
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_log2_poly_avx512(const __m512 x)
{
    // Coefficients, listed from the x^6 term down to the constant term.
    const __m512 k6 = _mm512_set1_ps(+0x1.57aa42p-6f);
    const __m512 k5 = _mm512_set1_ps(-0x1.c97982p-3f);
    const __m512 k4 = _mm512_set1_ps(+0x1.04fc3ap+0f);
    const __m512 k3 = _mm512_set1_ps(-0x1.4d476cp+1f);
    const __m512 k2 = _mm512_set1_ps(+0x1.05d9ccp+2f);
    const __m512 k1 = _mm512_set1_ps(-0x1.0b7f7ep+2f);
    const __m512 k0 = _mm512_set1_ps(+0x1.a8a726p+1f);

    // Horner scheme: (((((k6*x + k5)*x + k4)*x + k3)*x + k2)*x + k1)*x + k0
    __m512 acc = _mm512_fmadd_ps(k6, x, k5);
    acc = _mm512_fmadd_ps(acc, x, k4);
    acc = _mm512_fmadd_ps(acc, x, k3);
    acc = _mm512_fmadd_ps(acc, x, k2);
    acc = _mm512_fmadd_ps(acc, x, k1);
    return _mm512_fmadd_ps(acc, x, k0);
}

#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
29 changes: 29 additions & 0 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,4 +256,33 @@ static inline __m256 _mm256_accumulate_square_sum_ps(
return _mm256_add_ps(sq_acc, aux);
}

/*
 * Degree-6 minimax approximation of log2(x) / (x - 1) for x in [1, 2].
 * Coefficients were produced with Sollya:
 *   remez(log2(x)/(x-1), 6, [1+1b-20, 2])
 * Documented maximum error: ~1.55e-6.
 *
 * The caller recovers the logarithm as log2(x) ~= poly(x) * (x - 1).
 * Each Horner step is a separate multiply followed by an add.
 */
static inline __m256 _mm256_log2_poly_avx(const __m256 x)
{
    /* Coefficients, listed from the x^6 term down to the constant term. */
    const __m256 k6 = _mm256_set1_ps(+0x1.57aa42p-6f);
    const __m256 k5 = _mm256_set1_ps(-0x1.c97982p-3f);
    const __m256 k4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
    const __m256 k3 = _mm256_set1_ps(-0x1.4d476cp+1f);
    const __m256 k2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
    const __m256 k1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
    const __m256 k0 = _mm256_set1_ps(+0x1.a8a726p+1f);

    /* Horner scheme: (((((k6*x + k5)*x + k4)*x + k3)*x + k2)*x + k1)*x + k0 */
    __m256 acc = _mm256_add_ps(_mm256_mul_ps(k6, x), k5);
    acc = _mm256_add_ps(_mm256_mul_ps(acc, x), k4);
    acc = _mm256_add_ps(_mm256_mul_ps(acc, x), k3);
    acc = _mm256_add_ps(_mm256_mul_ps(acc, x), k2);
    acc = _mm256_add_ps(_mm256_mul_ps(acc, x), k1);
    return _mm256_add_ps(_mm256_mul_ps(acc, x), k0);
}

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
55 changes: 55 additions & 0 deletions include/volk/volk_neon_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,35 @@ static inline float32x4_t _vcos_poly_f32(float32x4_t x)
return vmlaq_f32(one, x2, poly);
}

/*
 * Degree-6 minimax approximation of log2(x) / (x - 1) for x in [1, 2].
 * Coefficients were produced with Sollya:
 *   remez(log2(x)/(x-1), 6, [1+1b-20, 2])
 * Documented maximum error: ~1.55e-6.
 *
 * The caller recovers the logarithm as log2(x) ~= poly(x) * (x - 1).
 * Each Horner step is one vmlaq_f32 (addend + a * b).
 */
static inline float32x4_t _vlog2_poly_f32(float32x4_t x)
{
    /* Coefficients, listed from the x^6 term down to the constant term. */
    const float32x4_t k6 = vdupq_n_f32(+0x1.57aa42p-6f);
    const float32x4_t k5 = vdupq_n_f32(-0x1.c97982p-3f);
    const float32x4_t k4 = vdupq_n_f32(+0x1.04fc3ap+0f);
    const float32x4_t k3 = vdupq_n_f32(-0x1.4d476cp+1f);
    const float32x4_t k2 = vdupq_n_f32(+0x1.05d9ccp+2f);
    const float32x4_t k1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
    const float32x4_t k0 = vdupq_n_f32(+0x1.a8a726p+1f);

    /* Horner scheme: (((((k6*x + k5)*x + k4)*x + k3)*x + k2)*x + k1)*x + k0 */
    float32x4_t acc = vmlaq_f32(k5, k6, x);
    acc = vmlaq_f32(k4, acc, x);
    acc = vmlaq_f32(k3, acc, x);
    acc = vmlaq_f32(k2, acc, x);
    acc = vmlaq_f32(k1, acc, x);
    return vmlaq_f32(k0, acc, x);
}

#ifdef LV_HAVE_NEONV8
/* ARMv8 NEON FMA-based arctan polynomial for better accuracy and throughput */
static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
Expand Down Expand Up @@ -454,6 +483,32 @@ static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
poly = vfmaq_f32(c1, x2, poly);
return vfmaq_f32(one, x2, poly);
}

/*
 * NEONv8 variant of the log2 polynomial on [1, 2], evaluated with fused
 * multiply-adds (vfmaq_f32).
 * The caller recovers the logarithm as log2(x) ~= poly(x) * (x - 1).
 * Documented maximum error: ~1.55e-6.
 */
static inline float32x4_t _vlog2_poly_neonv8(float32x4_t x)
{
    /* Coefficients, listed from the x^6 term down to the constant term. */
    const float32x4_t k6 = vdupq_n_f32(+0x1.57aa42p-6f);
    const float32x4_t k5 = vdupq_n_f32(-0x1.c97982p-3f);
    const float32x4_t k4 = vdupq_n_f32(+0x1.04fc3ap+0f);
    const float32x4_t k3 = vdupq_n_f32(-0x1.4d476cp+1f);
    const float32x4_t k2 = vdupq_n_f32(+0x1.05d9ccp+2f);
    const float32x4_t k1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
    const float32x4_t k0 = vdupq_n_f32(+0x1.a8a726p+1f);

    /* Horner scheme: (((((k6*x + k5)*x + k4)*x + k3)*x + k2)*x + k1)*x + k0 */
    float32x4_t acc = vfmaq_f32(k5, k6, x);
    acc = vfmaq_f32(k4, acc, x);
    acc = vfmaq_f32(k3, acc, x);
    acc = vfmaq_f32(k2, acc, x);
    acc = vfmaq_f32(k1, acc, x);
    return vfmaq_f32(k0, acc, x);
}
#endif /* LV_HAVE_NEONV8 */

#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
35 changes: 35 additions & 0 deletions include/volk/volk_rvv_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,39 @@

/*
 * Builds a mask by reinterpreting `v` through __riscv_vreinterpret_i##T and
 * comparing the result against 0 with __riscv_vmslt, i.e. it selects the
 * elements whose signed-integer view is negative.
 * `T` is token-pasted onto the reinterpret intrinsic's name (presumably an
 * element-width/LMUL suffix such as 32m2 -- confirm against callers); `vl` is
 * forwarded unchanged as the vector length.
 * NOTE(review): because the test is on the integer view, -0.0f would
 * presumably be selected as well -- confirm this is intended.
 */
#define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)

/*
 * Degree-6 minimax approximation of log2(x) / (x - 1) for x in [1, 2].
 * Coefficients were produced with Sollya:
 *   remez(log2(x)/(x-1), 6, [1+1b-20, 2])
 * Documented maximum error: ~1.55e-6.
 *
 * The caller recovers the logarithm as log2(x) ~= poly(x) * (x - 1).
 * Each Horner step is a single fused multiply-add.
 *
 * Parameters:
 *   x:     mantissa values in [1, 2)
 *   vl:    vector length passed to every multiply-add
 *   vlmax: vector length used when broadcasting the coefficients
 */
static inline vfloat32m2_t
__riscv_vlog2_poly_f32m2(vfloat32m2_t x, size_t vl, size_t vlmax)
{
    /* Coefficients, listed from the x^6 term down to the constant term. */
    const vfloat32m2_t k6 = __riscv_vfmv_v_f_f32m2(+0x1.57aa42p-6f, vlmax);
    const vfloat32m2_t k5 = __riscv_vfmv_v_f_f32m2(-0x1.c97982p-3f, vlmax);
    const vfloat32m2_t k4 = __riscv_vfmv_v_f_f32m2(+0x1.04fc3ap+0f, vlmax);
    const vfloat32m2_t k3 = __riscv_vfmv_v_f_f32m2(-0x1.4d476cp+1f, vlmax);
    const vfloat32m2_t k2 = __riscv_vfmv_v_f_f32m2(+0x1.05d9ccp+2f, vlmax);
    const vfloat32m2_t k1 = __riscv_vfmv_v_f_f32m2(-0x1.0b7f7ep+2f, vlmax);
    const vfloat32m2_t k0 = __riscv_vfmv_v_f_f32m2(+0x1.a8a726p+1f, vlmax);

    /* Horner scheme: (((((k6*x + k5)*x + k4)*x + k3)*x + k2)*x + k1)*x + k0 */
    vfloat32m2_t acc = __riscv_vfmadd(k6, x, k5, vl);
    acc = __riscv_vfmadd(acc, x, k4, vl);
    acc = __riscv_vfmadd(acc, x, k3, vl);
    acc = __riscv_vfmadd(acc, x, k2, vl);
    acc = __riscv_vfmadd(acc, x, k1, vl);
    return __riscv_vfmadd(acc, x, k0, vl);
}

#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */
29 changes: 29 additions & 0 deletions include/volk/volk_sse_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,4 +160,33 @@ static inline __m128 _mm_cos_poly_sse(const __m128 x)
return _mm_add_ps(_mm_mul_ps(x2, poly), one);
}

/*
 * Degree-6 minimax approximation of log2(x) / (x - 1) for x in [1, 2].
 * Coefficients were produced with Sollya:
 *   remez(log2(x)/(x-1), 6, [1+1b-20, 2])
 * Documented maximum error: ~1.55e-6.
 *
 * The caller recovers the logarithm as log2(x) ~= poly(x) * (x - 1).
 * Each Horner step is a separate multiply followed by an add.
 */
static inline __m128 _mm_log2_poly_sse(const __m128 x)
{
    /* Coefficients, listed from the x^6 term down to the constant term. */
    const __m128 k6 = _mm_set1_ps(+0x1.57aa42p-6f);
    const __m128 k5 = _mm_set1_ps(-0x1.c97982p-3f);
    const __m128 k4 = _mm_set1_ps(+0x1.04fc3ap+0f);
    const __m128 k3 = _mm_set1_ps(-0x1.4d476cp+1f);
    const __m128 k2 = _mm_set1_ps(+0x1.05d9ccp+2f);
    const __m128 k1 = _mm_set1_ps(-0x1.0b7f7ep+2f);
    const __m128 k0 = _mm_set1_ps(+0x1.a8a726p+1f);

    /* Horner scheme: (((((k6*x + k5)*x + k4)*x + k3)*x + k2)*x + k1)*x + k0 */
    __m128 acc = _mm_add_ps(_mm_mul_ps(k6, x), k5);
    acc = _mm_add_ps(_mm_mul_ps(acc, x), k4);
    acc = _mm_add_ps(_mm_mul_ps(acc, x), k3);
    acc = _mm_add_ps(_mm_mul_ps(acc, x), k2);
    acc = _mm_add_ps(_mm_mul_ps(acc, x), k1);
    return _mm_add_ps(_mm_mul_ps(acc, x), k0);
}

#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
Loading