
Commit fcbbac8

Merge pull request #807 from Ka-zam/rotator_fix: Rotator fix
2 parents: 837e4b2 + 2c1515f

14 files changed: +627 -992 lines

kernels/volk/volk_32f_log2_32f.h

Lines changed: 17 additions & 9 deletions
@@ -1165,20 +1165,24 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po
     // (-1)^sign * 2^exp * 1.significand, so the log2 is
     // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
     for (number = 0; number < quarterPoints; ++number) {
-        // Check for NaN or negative/zero (invalid inputs for log2)
+        // Check for negative, zero, and NaN inputs
         float32x4_t aval_f = vld1q_f32(aPtr);
-        uint32x4_t invalid_mask = vcleq_f32(aval_f, vdupq_n_f32(0.0f)); // aVal <= 0
+        uint32x4_t neg_mask = vcltq_f32(aval_f, vdupq_n_f32(0.0f));  // aVal < 0
+        uint32x4_t zero_mask = vceqq_f32(aval_f, vdupq_n_f32(0.0f)); // aVal == 0
         // Check for NaN: NaN comparison with itself returns false
         uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aval_f, aval_f)); // NOT(aVal == aVal)
-        invalid_mask = vorrq_u32(invalid_mask, nan_mask); // Combine masks
+        uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask); // neg or NaN -> NaN
         float32x4_t nan_value = vdupq_n_f32(NAN);
+        float32x4_t neg_inf_value = vdupq_n_f32(-127.0f); // log2(0) = -inf mapped to -127

         // load float in to an int register without conversion
         aval = vld1q_s32((int*)aPtr);

         VLOG2Q_NEON_F32(log2_approx, aval)

-        // Replace invalid results with NaN
+        // Replace zero inputs with -127.0 (log2(0) = -inf mapped to -127)
+        log2_approx = vbslq_f32(zero_mask, neg_inf_value, log2_approx);
+        // Replace negative/NaN inputs with NaN
         log2_approx = vbslq_f32(invalid_mask, nan_value, log2_approx);

         vst1q_f32(bPtr, log2_approx);
@@ -1220,14 +1224,16 @@ volk_32f_log2_32f_neonv8(float* bVector, const float* aVector, unsigned int num_
     const float32x4_t fone = vdupq_n_f32(1.0f);
     const float32x4_t fzero = vdupq_n_f32(0.0f);
     const float32x4_t nan_val = vdupq_n_f32(NAN);
+    const float32x4_t neg_inf_val = vdupq_n_f32(-127.0f); // log2(0) = -inf mapped to -127

     for (number = 0; number < quarterPoints; ++number) {
         float32x4_t aVal = vld1q_f32(aPtr);

-        // Check for invalid inputs (NaN, negative, or zero)
-        uint32x4_t invalid_mask = vcleq_f32(aVal, fzero);
-        uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal));
-        invalid_mask = vorrq_u32(invalid_mask, nan_mask);
+        // Check for negative, zero, and NaN inputs
+        uint32x4_t neg_mask = vcltq_f32(aVal, fzero);   // aVal < 0
+        uint32x4_t zero_mask = vceqq_f32(aVal, fzero);  // aVal == 0
+        uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal)); // NaN check
+        uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask); // neg or NaN -> NaN

         // Reinterpret as int for bit manipulation
         int32x4_t aVal_i = vreinterpretq_s32_f32(aVal);
@@ -1252,7 +1258,9 @@ volk_32f_log2_32f_neonv8(float* bVector, const float* aVector, unsigned int num_
         // result = exp + mantissa * (frac - 1.0)
         float32x4_t bVal = vfmaq_f32(exp_f, mantissa, vsubq_f32(frac, fone));

-        // Replace invalid results with NaN
+        // Replace zero inputs with -127.0 (log2(0) = -inf mapped to -127)
+        bVal = vbslq_f32(zero_mask, neg_inf_val, bVal);
+        // Replace negative/NaN inputs with NaN
         bVal = vbslq_f32(invalid_mask, nan_val, bVal);

         vst1q_f32(bPtr, bVal);
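In plain terms, both NEON code paths now distinguish three input classes instead of treating them uniformly: negative and NaN inputs yield NaN, while exact zeros map to -127.0f, the approximation's stand-in for log2(0) = -inf. A scalar sketch of that policy (log2_with_special_cases is a hypothetical helper, not part of the patch):

#include <math.h>

/* Hypothetical scalar reference of the special-case handling above. */
static float log2_with_special_cases(float x)
{
    if (isnan(x) || x < 0.0f)
        return NAN;     /* negative or NaN input -> NaN */
    if (x == 0.0f)
        return -127.0f; /* log2(0) = -inf, mapped to the minimum exponent */
    return log2f(x);    /* ordinary case */
}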

kernels/volk/volk_32f_s32f_convert_16i.h

Lines changed: 10 additions & 1 deletion
@@ -692,7 +692,16 @@ static inline void volk_32f_s32f_convert_16i_neon(int16_t* outputVector,
         float32x4_t ret2 =
             vmaxq_f32(vminq_f32(vmulq_f32(inputVal2, vScalar), vmax_val), vmin_val);

-        // Convert to int32 (truncates towards zero)
+        // Round to nearest: add copysign(0.5, x) before truncating
+        float32x4_t half = vdupq_n_f32(0.5f);
+        float32x4_t neg_half = vdupq_n_f32(-0.5f);
+        float32x4_t zero = vdupq_n_f32(0.0f);
+        uint32x4_t neg1 = vcltq_f32(ret1, zero);
+        uint32x4_t neg2 = vcltq_f32(ret2, zero);
+        ret1 = vaddq_f32(ret1, vbslq_f32(neg1, neg_half, half));
+        ret2 = vaddq_f32(ret2, vbslq_f32(neg2, neg_half, half));
+
+        // Convert to int32 (truncates towards zero, but we pre-rounded)
         int32x4_t intVal1 = vcvtq_s32_f32(ret1);
         int32x4_t intVal2 = vcvtq_s32_f32(ret2);

kernels/volk/volk_32f_s32f_convert_32i.h

Lines changed: 6 additions & 0 deletions
@@ -425,11 +425,17 @@ static inline void volk_32f_s32f_convert_32i_neon(int32_t* outputVector,
     float32x4_t vScalar = vdupq_n_f32(scalar);
     float32x4_t vmin_val = vdupq_n_f32(min_val);
     float32x4_t vmax_val = vdupq_n_f32(max_val);
+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t neg_half = vdupq_n_f32(-0.5f);
+    float32x4_t zero = vdupq_n_f32(0.0f);

     for (; number < quarter_points; number++) {
         float32x4_t inputVal = vld1q_f32(inputPtr);
         inputVal = vmulq_f32(inputVal, vScalar);
         inputVal = vmaxq_f32(vminq_f32(inputVal, vmax_val), vmin_val);
+        // Round to nearest: add copysign(0.5, x) before truncating
+        uint32x4_t neg = vcltq_f32(inputVal, zero);
+        inputVal = vaddq_f32(inputVal, vbslq_f32(neg, neg_half, half));
         int32x4_t intVal = vcvtq_s32_f32(inputVal);
         vst1q_s32(outputPtr, intVal);
         inputPtr += 4;
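The same four intrinsics recur in each kernel below, so the per-vector pattern could in principle be factored into a helper. A hypothetical sketch (round_f32_to_s32 is not part of the patch) of how vcltq_f32 and vbslq_f32 build the per-lane copysign(0.5, x) term:

#include <arm_neon.h>

/* Hypothetical helper, not in the patch: per-lane round-half-away-from-zero. */
static inline int32x4_t round_f32_to_s32(float32x4_t v)
{
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t neg_half = vdupq_n_f32(-0.5f);
    uint32x4_t neg = vcltq_f32(v, vdupq_n_f32(0.0f)); /* lanes where v < 0 */
    v = vaddq_f32(v, vbslq_f32(neg, neg_half, half)); /* v + copysign(0.5, v) */
    return vcvtq_s32_f32(v);                          /* truncation toward zero */
}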

kernels/volk/volk_32f_s32f_convert_8i.h

Lines changed: 14 additions & 1 deletion
@@ -576,6 +576,9 @@ static inline void volk_32f_s32f_convert_8i_neon(int8_t* outputVector,
     float32x4_t vScalar = vdupq_n_f32(scalar);
     float32x4_t vmin_val = vdupq_n_f32(min_val);
     float32x4_t vmax_val = vdupq_n_f32(max_val);
+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t neg_half = vdupq_n_f32(-0.5f);
+    float32x4_t zero = vdupq_n_f32(0.0f);

     for (; number < sixteenthPoints; number++) {
         float32x4_t inputVal0 = vld1q_f32(inputVectorPtr);
@@ -594,7 +597,17 @@ static inline void volk_32f_s32f_convert_8i_neon(int8_t* outputVector,
         float32x4_t ret3 =
             vmaxq_f32(vminq_f32(vmulq_f32(inputVal3, vScalar), vmax_val), vmin_val);

-        // Convert to int32 (truncates towards zero)
+        // Round to nearest: add copysign(0.5, x) before truncating
+        uint32x4_t neg0 = vcltq_f32(ret0, zero);
+        uint32x4_t neg1 = vcltq_f32(ret1, zero);
+        uint32x4_t neg2 = vcltq_f32(ret2, zero);
+        uint32x4_t neg3 = vcltq_f32(ret3, zero);
+        ret0 = vaddq_f32(ret0, vbslq_f32(neg0, neg_half, half));
+        ret1 = vaddq_f32(ret1, vbslq_f32(neg1, neg_half, half));
+        ret2 = vaddq_f32(ret2, vbslq_f32(neg2, neg_half, half));
+        ret3 = vaddq_f32(ret3, vbslq_f32(neg3, neg_half, half));
+
+        // Convert to int32 (truncates towards zero, but we pre-rounded)
         int32x4_t intVal0 = vcvtq_s32_f32(ret0);
         int32x4_t intVal1 = vcvtq_s32_f32(ret1);
         int32x4_t intVal2 = vcvtq_s32_f32(ret2);

kernels/volk/volk_32f_x2_s32f_interleave_16ic.h

Lines changed: 9 additions & 0 deletions
@@ -343,6 +343,9 @@ static inline void volk_32f_x2_s32f_interleave_16ic_neon(lv_16sc_t* complexVecto
     int16_t* complexVectorPtr = (int16_t*)complexVector;

     float32x4_t vScalar = vdupq_n_f32(scalar);
+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t neg_half = vdupq_n_f32(-0.5f);
+    float32x4_t zero = vdupq_n_f32(0.0f);

     for (; number < quarter_points; number++) {
         float32x4_t iValue = vld1q_f32(iBufferPtr);
@@ -351,6 +354,12 @@ static inline void volk_32f_x2_s32f_interleave_16ic_neon(lv_16sc_t* complexVecto
         iValue = vmulq_f32(iValue, vScalar);
         qValue = vmulq_f32(qValue, vScalar);

+        // Round to nearest: add copysign(0.5, x) before truncating
+        uint32x4_t iNeg = vcltq_f32(iValue, zero);
+        uint32x4_t qNeg = vcltq_f32(qValue, zero);
+        iValue = vaddq_f32(iValue, vbslq_f32(iNeg, neg_half, half));
+        qValue = vaddq_f32(qValue, vbslq_f32(qNeg, neg_half, half));
+
         int32x4_t iInt = vcvtq_s32_f32(iValue);
         int32x4_t qInt = vcvtq_s32_f32(qValue);

kernels/volk/volk_32f_x3_sum_of_poly_32f.h

Lines changed: 5 additions & 61 deletions
@@ -364,66 +364,11 @@ static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>

-static inline void
-volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
-                                   float* __restrict src0,
-                                   float* __restrict center_point_array,
-                                   float* __restrict cutoff,
-                                   unsigned int num_points)
-{
-    unsigned int i;
-    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
-
-    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
-    float32x2_t cutoff_vector;
-    float32x2x2_t x_low, x_high;
-    float32x4_t x_qvector, c_qvector, cpa_qvector;
-    float accumulator;
-    float res_accumulators[4];
-
-    c_qvector = vld1q_f32(zero);
-    // load the cutoff in to a vector
-    cutoff_vector = vdup_n_f32(*cutoff);
-    // ... center point array
-    cpa_qvector = vld1q_f32(center_point_array);
-
-    for (i = 0; i < num_points; ++i) {
-        // load x (src0)
-        x_to_1 = vdup_n_f32(*src0++);
-
-        // Get a vector of max(src0, cutoff)
-        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
-        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
-        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
-        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
-        // zip up doubles to interleave
-        x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
-        x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
-        // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
-        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
-        // now we finally have [x^4 | x^3 | x^2 | x] !
-
-        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
-    }
-    // there should be better vector reduction techniques
-    vst1q_f32(res_accumulators, c_qvector);
-    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
-                  res_accumulators[3];
-
-    *target = accumulator + (float)num_points * center_point_array[4];
-}
-
-#endif /* LV_HAVE_NEON */
-
-
-#ifdef LV_HAVE_NEON
-
-static inline void
-volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
-                                     float* __restrict src0,
-                                     float* __restrict center_point_array,
-                                     float* __restrict cutoff,
-                                     unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_neon(float* __restrict target,
+                                                    float* __restrict src0,
+                                                    float* __restrict center_point_array,
+                                                    float* __restrict cutoff,
+                                                    unsigned int num_points)
 {
     unsigned int i;
     float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
@@ -446,7 +391,6 @@ volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
     cpa_2 = vdupq_n_f32(center_point_array[2]);
     cpa_3 = vdupq_n_f32(center_point_array[3]);

-    // nathan is not sure why this is slower *and* wrong compared to neonvertfma
     for (i = 0; i < num_points / 4; ++i) {
         // load x
         x_to_1 = vld1q_f32(src0);
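This hunk consolidates the two NEON variants into a single volk_32f_x3_sum_of_poly_32f_neon. As the removed _a_neon body shows, the kernel evaluates a degree-4 polynomial (minus its constant term) at max(x, cutoff) for each point, sums the results, and adds num_points times the constant coefficient. A hypothetical scalar reference of that computation (not part of the patch):

/* Scalar reference based on the removed NEON body above; the consolidated
 * kernel vectorizes the same computation four points at a time. */
static void sum_of_poly_scalar_ref(float* target,
                                   const float* src0,
                                   const float* center_point_array,
                                   const float* cutoff,
                                   unsigned int num_points)
{
    float acc = 0.0f;
    for (unsigned int i = 0; i < num_points; ++i) {
        float x = (src0[i] > *cutoff) ? src0[i] : *cutoff; /* max(x, cutoff) */
        float x2 = x * x;
        acc += center_point_array[0] * x + center_point_array[1] * x2 +
               center_point_array[2] * x2 * x + center_point_array[3] * x2 * x2;
    }
    *target = acc + (float)num_points * center_point_array[4];
}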

kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h

Lines changed: 7 additions & 0 deletions
@@ -269,11 +269,18 @@ volk_32fc_s32f_deinterleave_real_16i_neon(int16_t* iBuffer,
     int16_t* iBufferPtr = iBuffer;
     float32x4_t vScalar = vdupq_n_f32(scalar);

+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t neg_half = vdupq_n_f32(-0.5f);
+    float32x4_t zero = vdupq_n_f32(0.0f);
+
     for (; number < quarter_points; number++) {
         float32x4x2_t input = vld2q_f32(complexVectorPtr);
         complexVectorPtr += 8;

         float32x4_t scaled = vmulq_f32(input.val[0], vScalar);
+        // Round to nearest: add copysign(0.5, x) before truncating
+        uint32x4_t neg = vcltq_f32(scaled, zero);
+        scaled = vaddq_f32(scaled, vbslq_f32(neg, neg_half, half));
         int32x4_t intVal = vcvtq_s32_f32(scaled);
         int16x4_t shortVal = vqmovn_s32(intVal);


kernels/volk/volk_32fc_s32f_magnitude_16i.h

Lines changed: 4 additions & 1 deletion
@@ -317,6 +317,8 @@ static inline void volk_32fc_s32f_magnitude_16i_neon(int16_t* magnitudeVector,
     int16_t* magnitudeVectorPtr = magnitudeVector;
     float32x4_t vScalar = vdupq_n_f32(scalar);

+    float32x4_t half = vdupq_n_f32(0.5f);
+
     for (; number < quarter_points; number++) {
         float32x4x2_t input = vld2q_f32(complexVectorPtr);
         complexVectorPtr += 8;
@@ -335,7 +337,8 @@ static inline void volk_32fc_s32f_magnitude_16i_neon(int16_t* magnitudeVector,
         uint32x4_t zero_mask = vceqq_f32(sumSquared, vdupq_n_f32(0.0f));
         magnitude = vbslq_f32(zero_mask, sumSquared, magnitude);

-        float32x4_t scaled = vmulq_f32(magnitude, vScalar);
+        // Magnitude is always non-negative, so just add 0.5 for rounding
+        float32x4_t scaled = vaddq_f32(vmulq_f32(magnitude, vScalar), half);
         int32x4_t intVal = vcvtq_s32_f32(scaled);
         int16x4_t shortVal = vqmovn_s32(intVal);

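Because the clamped magnitude is never negative, the copysign selection used in the other kernels collapses to a plain +0.5f here; truncating x + 0.5f for non-negative x is ordinary round-half-up. A tiny, hypothetical sanity check (not part of the patch):

#include <assert.h>

int main(void)
{
    assert((int)(3.49f + 0.5f) == 3); /* below the half-way point: rounds down */
    assert((int)(3.50f + 0.5f) == 4); /* halves round up */
    assert((int)(0.00f + 0.5f) == 0);
    return 0;
}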

kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h

Lines changed: 0 additions & 96 deletions
@@ -55,63 +55,6 @@ static inline void volk_32fc_s32fc_rotator2puppet_32fc_neon(lv_32fc_t* outVector
 #endif /* LV_HAVE_NEON */


-#ifdef LV_HAVE_NEONV8
-#include <arm_neon.h>
-
-static inline void volk_32fc_s32fc_rotator2puppet_32fc_neonv8(lv_32fc_t* outVector,
-                                                              const lv_32fc_t* inVector,
-                                                              const lv_32fc_t* phase_inc,
-                                                              unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_neonv8(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-#endif /* LV_HAVE_NEONV8 */
-
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-static inline void
-volk_32fc_s32fc_rotator2puppet_32fc_a_sse4_1(lv_32fc_t* outVector,
-                                             const lv_32fc_t* inVector,
-                                             const lv_32fc_t* phase_inc,
-                                             unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-static inline void
-volk_32fc_s32fc_rotator2puppet_32fc_u_sse4_1(lv_32fc_t* outVector,
-                                             const lv_32fc_t* inVector,
-                                             const lv_32fc_t* phase_inc,
-                                             unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_SSE4_1 */
-
-
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>

@@ -189,45 +132,6 @@ volk_32fc_s32fc_rotator2puppet_32fc_u_avx512f(lv_32fc_t* outVector,

 #endif /* LV_HAVE_AVX512F */

-#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
-
-static inline void
-volk_32fc_s32fc_rotator2puppet_32fc_a_avx_fma(lv_32fc_t* outVector,
-                                              const lv_32fc_t* inVector,
-                                              const lv_32fc_t* phase_inc,
-                                              unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
-
-
-#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
-
-static inline void
-volk_32fc_s32fc_rotator2puppet_32fc_u_avx_fma(lv_32fc_t* outVector,
-                                              const lv_32fc_t* inVector,
-                                              const lv_32fc_t* phase_inc,
-                                              unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
-
 #ifdef LV_HAVE_RVV
 static inline void volk_32fc_s32fc_rotator2puppet_32fc_rvv(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
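All of the removed puppet variants share the body visible above: normalize a fixed starting phase and the supplied phase increment to unit magnitude, then forward to the matching volk_32fc_s32fc_x2_rotator2_32fc_* worker. A generic sketch of that pattern (the function name and the elided worker call are placeholders, not part of the file):

#include <math.h>
#include <volk/volk_complex.h> /* lv_32fc_t, lv_cmake, lv_creal, lv_cimag */

static inline void rotator2_puppet_pattern(lv_32fc_t* outVector,
                                           const lv_32fc_t* inVector,
                                           const lv_32fc_t* phase_inc,
                                           unsigned int num_points)
{
    /* Fixed test phase, normalized to unit magnitude. */
    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    /* Normalize the phase increment as well. */
    const lv_32fc_t phase_inc_n =
        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
    /* ...dispatch to an architecture-specific
       volk_32fc_s32fc_x2_rotator2_32fc_* worker here... */
    (void)outVector;
    (void)inVector;
    (void)phase_inc_n;
    (void)num_points;
}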
