Merge pull request #812 from Ka-zam/log2_edge_cases

jdemel · web-flow · commit 3cf2f5382b1e · 2026-01-29T21:20:42.000+01:00
Log2 edge cases
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
@@ -110,4 +110,33 @@ static inline __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
     return _mm256_fmadd_ps(x2, poly, one);
 }
 
+/*
+ * Evaluates a degree-6 polynomial approximating log2(x)/(x-1) on [1, 2].
+ * Coefficients generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
+ * Max error: ~1.55e-6
+ *
+ * Returns poly(x) only; the caller forms log2(x) ≈ poly(x) * (x - 1), x ∈ [1, 2].
+ * Each Horner step is a single _mm256_fmadd_ps, i.e. poly * x + c per lane.
+ */
+static inline __m256 _mm256_log2_poly_avx2_fma(const __m256 x)
+{
+    const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f); // constant term
+    const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
+    const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
+    const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
+    const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
+    const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
+    const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f); // x^6 term
+
+    // Horner's method, highest order first: c0 + x*(c1 + x*(c2 + ... + x*c6))
+    __m256 poly = c6;
+    poly = _mm256_fmadd_ps(poly, x, c5);
+    poly = _mm256_fmadd_ps(poly, x, c4);
+    poly = _mm256_fmadd_ps(poly, x, c3);
+    poly = _mm256_fmadd_ps(poly, x, c2);
+    poly = _mm256_fmadd_ps(poly, x, c1);
+    poly = _mm256_fmadd_ps(poly, x, c0);
+    return poly;
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h
@@ -414,4 +414,33 @@ static inline __m256 _mm256_cos_poly_avx2(const __m256 x)
     return _mm256_add_ps(_mm256_mul_ps(x2, poly), one);
 }
 
+/*
+ * Evaluates a degree-6 polynomial approximating log2(x)/(x-1) on [1, 2].
+ * Coefficients generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
+ * Max error: ~1.55e-6
+ *
+ * Returns poly(x) only; the caller forms log2(x) ≈ poly(x) * (x - 1), x ∈ [1, 2].
+ * Each Horner step is a _mm256_mul_ps followed by a _mm256_add_ps (no FMA intrinsic).
+ */
+static inline __m256 _mm256_log2_poly_avx2(const __m256 x)
+{
+    const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f); // constant term
+    const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
+    const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
+    const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
+    const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
+    const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
+    const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f); // x^6 term
+
+    // Horner's method, highest order first: c0 + x*(c1 + x*(c2 + ... + x*c6))
+    __m256 poly = c6;
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c5);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c4);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c3);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c2);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c1);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c0);
+    return poly;
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h
@@ -238,4 +238,34 @@ static inline __m512 _mm512_cos_poly_avx512(const __m512 x)
     return _mm512_fmadd_ps(x2, poly, one);
 }
 
+////////////////////////////////////////////////////////////////////////
+// Evaluates a degree-6 polynomial approximating log2(x)/(x-1) on [1, 2]
+// Coefficients generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
+// Max error: ~1.55e-6
+//
+// Returns poly(x) only; the caller forms log2(x) ≈ poly(x) * (x - 1), x ∈ [1, 2]
+// Each Horner step is a single _mm512_fmadd_ps, i.e. poly * x + c per lane
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_log2_poly_avx512(const __m512 x)
+{
+    const __m512 c0 = _mm512_set1_ps(+0x1.a8a726p+1f); // constant term
+    const __m512 c1 = _mm512_set1_ps(-0x1.0b7f7ep+2f);
+    const __m512 c2 = _mm512_set1_ps(+0x1.05d9ccp+2f);
+    const __m512 c3 = _mm512_set1_ps(-0x1.4d476cp+1f);
+    const __m512 c4 = _mm512_set1_ps(+0x1.04fc3ap+0f);
+    const __m512 c5 = _mm512_set1_ps(-0x1.c97982p-3f);
+    const __m512 c6 = _mm512_set1_ps(+0x1.57aa42p-6f); // x^6 term
+
+    // Horner's method, highest order first: c0 + x*(c1 + x*(c2 + ... + x*c6))
+    __m512 poly = c6;
+    poly = _mm512_fmadd_ps(poly, x, c5);
+    poly = _mm512_fmadd_ps(poly, x, c4);
+    poly = _mm512_fmadd_ps(poly, x, c3);
+    poly = _mm512_fmadd_ps(poly, x, c2);
+    poly = _mm512_fmadd_ps(poly, x, c1);
+    poly = _mm512_fmadd_ps(poly, x, c0);
+    return poly;
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
@@ -293,4 +293,33 @@ static inline __m256 _mm256_accumulate_square_sum_ps(
     return _mm256_add_ps(sq_acc, aux);
 }
 
+/*
+ * Evaluates a degree-6 polynomial approximating log2(x)/(x-1) on [1, 2].
+ * Coefficients generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
+ * Max error: ~1.55e-6
+ *
+ * Returns poly(x) only; the caller forms log2(x) ≈ poly(x) * (x - 1), x ∈ [1, 2].
+ * Each Horner step is a _mm256_mul_ps followed by a _mm256_add_ps (no FMA intrinsic).
+ */
+static inline __m256 _mm256_log2_poly_avx(const __m256 x)
+{
+    const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f); // constant term
+    const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
+    const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
+    const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
+    const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
+    const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
+    const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f); // x^6 term
+
+    // Horner's method, highest order first: c0 + x*(c1 + x*(c2 + ... + x*c6))
+    __m256 poly = c6;
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c5);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c4);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c3);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c2);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c1);
+    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c0);
+    return poly;
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h
@@ -407,6 +407,35 @@ static inline float32x4_t _vcos_poly_f32(float32x4_t x)
     return vmlaq_f32(one, x2, poly);
 }
 
+/*
+ * Evaluates a degree-6 polynomial approximating log2(x)/(x-1) on [1, 2].
+ * Coefficients generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
+ * Max error: ~1.55e-6
+ *
+ * Returns poly(x) only; the caller forms log2(x) ≈ poly(x) * (x - 1), x ∈ [1, 2].
+ * Each Horner step is one vmlaq_f32 multiply-accumulate across the 4 lanes.
+ */
+static inline float32x4_t _vlog2_poly_f32(float32x4_t x)
+{
+    const float32x4_t c0 = vdupq_n_f32(+0x1.a8a726p+1f); // constant term
+    const float32x4_t c1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
+    const float32x4_t c2 = vdupq_n_f32(+0x1.05d9ccp+2f);
+    const float32x4_t c3 = vdupq_n_f32(-0x1.4d476cp+1f);
+    const float32x4_t c4 = vdupq_n_f32(+0x1.04fc3ap+0f);
+    const float32x4_t c5 = vdupq_n_f32(-0x1.c97982p-3f);
+    const float32x4_t c6 = vdupq_n_f32(+0x1.57aa42p-6f); // x^6 term
+
+    // Horner's method: vmlaq_f32(c, poly, x) yields c + poly * x at each step
+    float32x4_t poly = c6;
+    poly = vmlaq_f32(c5, poly, x);
+    poly = vmlaq_f32(c4, poly, x);
+    poly = vmlaq_f32(c3, poly, x);
+    poly = vmlaq_f32(c2, poly, x);
+    poly = vmlaq_f32(c1, poly, x);
+    poly = vmlaq_f32(c0, poly, x);
+    return poly;
+}
+
 #ifdef LV_HAVE_NEONV8
 /* ARMv8 NEON FMA-based arctan polynomial for better accuracy and throughput */
 static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
@@ -461,6 +490,32 @@ static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
     poly = vfmaq_f32(c1, x2, poly);
     return vfmaq_f32(one, x2, poly);
 }
+
+/*
+ * NEONv8 FMA variant of the degree-6 log2 polynomial on [1, 2].
+ * Returns poly(x) only; the caller forms log2(x) ≈ poly(x) * (x - 1).
+ * Max error: ~1.55e-6
+ */
+static inline float32x4_t _vlog2_poly_neonv8(float32x4_t x)
+{
+    const float32x4_t c0 = vdupq_n_f32(+0x1.a8a726p+1f); // constant term
+    const float32x4_t c1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
+    const float32x4_t c2 = vdupq_n_f32(+0x1.05d9ccp+2f);
+    const float32x4_t c3 = vdupq_n_f32(-0x1.4d476cp+1f);
+    const float32x4_t c4 = vdupq_n_f32(+0x1.04fc3ap+0f);
+    const float32x4_t c5 = vdupq_n_f32(-0x1.c97982p-3f);
+    const float32x4_t c6 = vdupq_n_f32(+0x1.57aa42p-6f); // x^6 term
+
+    // Horner's method: vfmaq_f32(c, poly, x) yields the fused c + poly * x
+    float32x4_t poly = c6;
+    poly = vfmaq_f32(c5, poly, x);
+    poly = vfmaq_f32(c4, poly, x);
+    poly = vfmaq_f32(c3, poly, x);
+    poly = vfmaq_f32(c2, poly, x);
+    poly = vfmaq_f32(c1, poly, x);
+    poly = vfmaq_f32(c0, poly, x);
+    return poly;
+}
 #endif /* LV_HAVE_NEONV8 */
 
 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
diff --git a/include/volk/volk_rvv_intrinsics.h b/include/volk/volk_rvv_intrinsics.h
@@ -74,4 +74,39 @@
 
 #define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)
 
+/*
+ * Evaluates a degree-6 polynomial approximating log2(x)/(x-1) on [1, 2].
+ * Coefficients generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
+ * Max error: ~1.55e-6
+ *
+ * Returns poly(x) only; the caller forms log2(x) ≈ poly(x) * (x - 1), x ∈ [1, 2].
+ * Each Horner step is a single __riscv_vfmadd (poly * x + c).
+ *
+ * Parameters:
+ *   x: mantissa values in [1, 2)
+ *   vl: element count passed to every __riscv_vfmadd step
+ *   vlmax: element count used when broadcasting each coefficient into a vector
+ */
+static inline vfloat32m2_t
+__riscv_vlog2_poly_f32m2(vfloat32m2_t x, size_t vl, size_t vlmax)
+{
+    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(+0x1.a8a726p+1f, vlmax);
+    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0x1.0b7f7ep+2f, vlmax);
+    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(+0x1.05d9ccp+2f, vlmax);
+    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.4d476cp+1f, vlmax);
+    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(+0x1.04fc3ap+0f, vlmax);
+    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(-0x1.c97982p-3f, vlmax);
+    const vfloat32m2_t c6 = __riscv_vfmv_v_f_f32m2(+0x1.57aa42p-6f, vlmax);
+
+    // Horner's method, highest order first: c0 + x*(c1 + x*(c2 + ... + x*c6))
+    vfloat32m2_t poly = c6;
+    poly = __riscv_vfmadd(poly, x, c5, vl);
+    poly = __riscv_vfmadd(poly, x, c4, vl);
+    poly = __riscv_vfmadd(poly, x, c3, vl);
+    poly = __riscv_vfmadd(poly, x, c2, vl);
+    poly = __riscv_vfmadd(poly, x, c1, vl);
+    poly = __riscv_vfmadd(poly, x, c0, vl);
+    return poly;
+}
+
 #endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */
diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h
@@ -188,4 +188,33 @@ static inline __m128 _mm_cos_poly_sse(const __m128 x)
     return _mm_add_ps(_mm_mul_ps(x2, poly), one);
 }
 
+/*
+ * Evaluates a degree-6 polynomial approximating log2(x)/(x-1) on [1, 2].
+ * Coefficients generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
+ * Max error: ~1.55e-6
+ *
+ * Returns poly(x) only; the caller forms log2(x) ≈ poly(x) * (x - 1), x ∈ [1, 2].
+ * Each Horner step is a _mm_mul_ps followed by a _mm_add_ps (no FMA intrinsic).
+ */
+static inline __m128 _mm_log2_poly_sse(const __m128 x)
+{
+    const __m128 c0 = _mm_set1_ps(+0x1.a8a726p+1f); // constant term
+    const __m128 c1 = _mm_set1_ps(-0x1.0b7f7ep+2f);
+    const __m128 c2 = _mm_set1_ps(+0x1.05d9ccp+2f);
+    const __m128 c3 = _mm_set1_ps(-0x1.4d476cp+1f);
+    const __m128 c4 = _mm_set1_ps(+0x1.04fc3ap+0f);
+    const __m128 c5 = _mm_set1_ps(-0x1.c97982p-3f);
+    const __m128 c6 = _mm_set1_ps(+0x1.57aa42p-6f); // x^6 term
+
+    // Horner's method, highest order first: c0 + x*(c1 + x*(c2 + ... + x*c6))
+    __m128 poly = c6;
+    poly = _mm_add_ps(_mm_mul_ps(poly, x), c5);
+    poly = _mm_add_ps(_mm_mul_ps(poly, x), c4);
+    poly = _mm_add_ps(_mm_mul_ps(poly, x), c3);
+    poly = _mm_add_ps(_mm_mul_ps(poly, x), c2);
+    poly = _mm_add_ps(_mm_mul_ps(poly, x), c1);
+    poly = _mm_add_ps(_mm_mul_ps(poly, x), c0);
+    return poly;
+}
+
 #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h
diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h