@@ -407,6 +407,35 @@ static inline float32x4_t _vcos_poly_f32(float32x4_t x)
407407 return vmlaq_f32 (one, x2, poly);
408408}
409409
410+ /*
411+ * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
412+ * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
413+ * Max error: ~1.55e-6
414+ *
415+ * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
416+ * Polynomial evaluated via Horner's method
417+ */
418+ static inline float32x4_t _vlog2_poly_f32 (float32x4_t x)
419+ {
420+ const float32x4_t c0 = vdupq_n_f32 (+0x1 .a8a726p +1f );
421+ const float32x4_t c1 = vdupq_n_f32 (-0x1 .0b7f7ep+2f );
422+ const float32x4_t c2 = vdupq_n_f32 (+0x1 .05d9ccp+2f );
423+ const float32x4_t c3 = vdupq_n_f32 (-0x1 .4d476cp+1f );
424+ const float32x4_t c4 = vdupq_n_f32 (+0x1 .04fc3ap+0f );
425+ const float32x4_t c5 = vdupq_n_f32 (-0x1 .c97982p -3f );
426+ const float32x4_t c6 = vdupq_n_f32 (+0x1 .57aa42p-6f );
427+
428+ // Horner's method: c0 + x*(c1 + x*(c2 + ...))
429+ float32x4_t poly = c6;
430+ poly = vmlaq_f32 (c5, poly, x);
431+ poly = vmlaq_f32 (c4, poly, x);
432+ poly = vmlaq_f32 (c3, poly, x);
433+ poly = vmlaq_f32 (c2, poly, x);
434+ poly = vmlaq_f32 (c1, poly, x);
435+ poly = vmlaq_f32 (c0, poly, x);
436+ return poly;
437+ }
438+
410439#ifdef LV_HAVE_NEONV8
411440/* ARMv8 NEON FMA-based arctan polynomial for better accuracy and throughput */
412441static inline float32x4_t _varctan_poly_neonv8 (float32x4_t x)
@@ -461,6 +490,32 @@ static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
461490 poly = vfmaq_f32 (c1, x2, poly);
462491 return vfmaq_f32 (one, x2, poly);
463492}
493+
494+ /*
495+ * NEONv8 FMA log2 polynomial on [1, 2]
496+ * log2(x) ≈ poly(x) * (x - 1)
497+ * Max error: ~1.55e-6
498+ */
499+ static inline float32x4_t _vlog2_poly_neonv8 (float32x4_t x)
500+ {
501+ const float32x4_t c0 = vdupq_n_f32 (+0x1 .a8a726p +1f );
502+ const float32x4_t c1 = vdupq_n_f32 (-0x1 .0b7f7ep+2f );
503+ const float32x4_t c2 = vdupq_n_f32 (+0x1 .05d9ccp+2f );
504+ const float32x4_t c3 = vdupq_n_f32 (-0x1 .4d476cp+1f );
505+ const float32x4_t c4 = vdupq_n_f32 (+0x1 .04fc3ap+0f );
506+ const float32x4_t c5 = vdupq_n_f32 (-0x1 .c97982p -3f );
507+ const float32x4_t c6 = vdupq_n_f32 (+0x1 .57aa42p-6f );
508+
509+ // Horner's method with FMA: c0 + x*(c1 + x*(c2 + ...))
510+ float32x4_t poly = c6;
511+ poly = vfmaq_f32 (c5, poly, x);
512+ poly = vfmaq_f32 (c4, poly, x);
513+ poly = vfmaq_f32 (c3, poly, x);
514+ poly = vfmaq_f32 (c2, poly, x);
515+ poly = vfmaq_f32 (c1, poly, x);
516+ poly = vfmaq_f32 (c0, poly, x);
517+ return poly;
518+ }
464519#endif /* LV_HAVE_NEONV8 */
465520
466521#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
0 commit comments