
Commit fcbbac8

Merge pull request #807 from Ka-zam/rotator_fix: Rotator fix
2 parents: 837e4b2 + 2c1515f

14 files changed: +627 -992 lines

kernels/volk/volk_32f_log2_32f.h

Lines changed: 17 additions & 9 deletions
@@ -1165,20 +1165,24 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po
     // (-1)^sign * 2^exp * 1.significand, so the log2 is
     // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
     for (number = 0; number < quarterPoints; ++number) {
-        // Check for NaN or negative/zero (invalid inputs for log2)
+        // Check for negative, zero, and NaN inputs
         float32x4_t aval_f = vld1q_f32(aPtr);
-        uint32x4_t invalid_mask = vcleq_f32(aval_f, vdupq_n_f32(0.0f)); // aVal <= 0
+        uint32x4_t neg_mask = vcltq_f32(aval_f, vdupq_n_f32(0.0f));  // aVal < 0
+        uint32x4_t zero_mask = vceqq_f32(aval_f, vdupq_n_f32(0.0f)); // aVal == 0
         // Check for NaN: NaN comparison with itself returns false
         uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aval_f, aval_f)); // NOT(aVal == aVal)
-        invalid_mask = vorrq_u32(invalid_mask, nan_mask); // Combine masks
+        uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask); // neg or NaN -> NaN
         float32x4_t nan_value = vdupq_n_f32(NAN);
+        float32x4_t neg_inf_value = vdupq_n_f32(-127.0f); // log2(0) = -inf mapped to -127

         // load float in to an int register without conversion
         aval = vld1q_s32((int*)aPtr);

         VLOG2Q_NEON_F32(log2_approx, aval)

-        // Replace invalid results with NaN
+        // Replace zero inputs with -127.0 (log2(0) = -inf mapped to -127)
+        log2_approx = vbslq_f32(zero_mask, neg_inf_value, log2_approx);
+        // Replace negative/NaN inputs with NaN
         log2_approx = vbslq_f32(invalid_mask, nan_value, log2_approx);

         vst1q_f32(bPtr, log2_approx);
@@ -1220,14 +1224,16 @@ volk_32f_log2_32f_neonv8(float* bVector, const float* aVector, unsigned int num_
     const float32x4_t fone = vdupq_n_f32(1.0f);
     const float32x4_t fzero = vdupq_n_f32(0.0f);
     const float32x4_t nan_val = vdupq_n_f32(NAN);
+    const float32x4_t neg_inf_val = vdupq_n_f32(-127.0f); // log2(0) = -inf mapped to -127

     for (number = 0; number < quarterPoints; ++number) {
         float32x4_t aVal = vld1q_f32(aPtr);

-        // Check for invalid inputs (NaN, negative, or zero)
-        uint32x4_t invalid_mask = vcleq_f32(aVal, fzero);
-        uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal));
-        invalid_mask = vorrq_u32(invalid_mask, nan_mask);
+        // Check for negative, zero, and NaN inputs
+        uint32x4_t neg_mask = vcltq_f32(aVal, fzero);   // aVal < 0
+        uint32x4_t zero_mask = vceqq_f32(aVal, fzero);  // aVal == 0
+        uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal)); // NaN check
+        uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask); // neg or NaN -> NaN

         // Reinterpret as int for bit manipulation
         int32x4_t aVal_i = vreinterpretq_s32_f32(aVal);
@@ -1252,7 +1258,9 @@ volk_32f_log2_32f_neonv8(float* bVector, const float* aVector, unsigned int num_
         // result = exp + mantissa * (frac - 1.0)
         float32x4_t bVal = vfmaq_f32(exp_f, mantissa, vsubq_f32(frac, fone));

-        // Replace invalid results with NaN
+        // Replace zero inputs with -127.0 (log2(0) = -inf mapped to -127)
+        bVal = vbslq_f32(zero_mask, neg_inf_val, bVal);
+        // Replace negative/NaN inputs with NaN
         bVal = vbslq_f32(invalid_mask, nan_val, bVal);

         vst1q_f32(bPtr, bVal);
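In plain terms, both NEON code paths now distinguish three input classes instead of treating them uniformly: negative and NaN inputs yield NaN, while exact zeros map to -127.0f, the approximation's stand-in for log2(0) = -inf. A scalar sketch of that policy (log2_with_special_cases is a hypothetical helper, not part of the patch):

#include <math.h>

/* Hypothetical scalar reference of the special-case handling above. */
static float log2_with_special_cases(float x)
{
    if (isnan(x) || x < 0.0f)
        return NAN;     /* negative or NaN input -> NaN */
    if (x == 0.0f)
        return -127.0f; /* log2(0) = -inf, mapped to the minimum exponent */
    return log2f(x);    /* ordinary case */
}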

kernels/volk/volk_32f_s32f_convert_16i.h

Lines changed: 10 additions & 1 deletion
@@ -692,7 +692,16 @@ static inline void volk_32f_s32f_convert_16i_neon(int16_t* outputVector,
         float32x4_t ret2 =
             vmaxq_f32(vminq_f32(vmulq_f32(inputVal2, vScalar), vmax_val), vmin_val);

-        // Convert to int32 (truncates towards zero)
+        // Round to nearest: add copysign(0.5, x) before truncating
+        float32x4_t half = vdupq_n_f32(0.5f);
+        float32x4_t neg_half = vdupq_n_f32(-0.5f);
+        float32x4_t zero = vdupq_n_f32(0.0f);
+        uint32x4_t neg1 = vcltq_f32(ret1, zero);
+        uint32x4_t neg2 = vcltq_f32(ret2, zero);
+        ret1 = vaddq_f32(ret1, vbslq_f32(neg1, neg_half, half));
+        ret2 = vaddq_f32(ret2, vbslq_f32(neg2, neg_half, half));
+
+        // Convert to int32 (truncates towards zero, but we pre-rounded)
         int32x4_t intVal1 = vcvtq_s32_f32(ret1);
         int32x4_t intVal2 = vcvtq_s32_f32(ret2);

kernels/volk/volk_32f_s32f_convert_32i.h

Lines changed: 6 additions & 0 deletions
@@ -425,11 +425,17 @@ static inline void volk_32f_s32f_convert_32i_neon(int32_t* outputVector,
     float32x4_t vScalar = vdupq_n_f32(scalar);
     float32x4_t vmin_val = vdupq_n_f32(min_val);
     float32x4_t vmax_val = vdupq_n_f32(max_val);
+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t neg_half = vdupq_n_f32(-0.5f);
+    float32x4_t zero = vdupq_n_f32(0.0f);

     for (; number < quarter_points; number++) {
         float32x4_t inputVal = vld1q_f32(inputPtr);
         inputVal = vmulq_f32(inputVal, vScalar);
         inputVal = vmaxq_f32(vminq_f32(inputVal, vmax_val), vmin_val);
+        // Round to nearest: add copysign(0.5, x) before truncating
+        uint32x4_t neg = vcltq_f32(inputVal, zero);
+        inputVal = vaddq_f32(inputVal, vbslq_f32(neg, neg_half, half));
         int32x4_t intVal = vcvtq_s32_f32(inputVal);
         vst1q_s32(outputPtr, intVal);
         inputPtr += 4;
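The same four intrinsics recur in each kernel below, so the per-vector pattern could in principle be factored into a helper. A hypothetical sketch (round_f32_to_s32 is not part of the patch) of how vcltq_f32 and vbslq_f32 build the per-lane copysign(0.5, x) term:

#include <arm_neon.h>

/* Hypothetical helper, not in the patch: per-lane round-half-away-from-zero. */
static inline int32x4_t round_f32_to_s32(float32x4_t v)
{
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t neg_half = vdupq_n_f32(-0.5f);
    uint32x4_t neg = vcltq_f32(v, vdupq_n_f32(0.0f)); /* lanes where v < 0 */
    v = vaddq_f32(v, vbslq_f32(neg, neg_half, half)); /* v + copysign(0.5, v) */
    return vcvtq_s32_f32(v);                          /* truncation toward zero */
}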

kernels/volk/volk_32f_s32f_convert_8i.h

Lines changed: 14 additions & 1 deletion
@@ -576,6 +576,9 @@ static inline void volk_32f_s32f_convert_8i_neon(int8_t* outputVector,
     float32x4_t vScalar = vdupq_n_f32(scalar);
     float32x4_t vmin_val = vdupq_n_f32(min_val);
     float32x4_t vmax_val = vdupq_n_f32(max_val);
+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t neg_half = vdupq_n_f32(-0.5f);
+    float32x4_t zero = vdupq_n_f32(0.0f);

     for (; number < sixteenthPoints; number++) {
         float32x4_t inputVal0 = vld1q_f32(inputVectorPtr);
@@ -594,7 +597,17 @@ static inline void volk_32f_s32f_convert_8i_neon(int8_t* outputVector,
         float32x4_t ret3 =
             vmaxq_f32(vminq_f32(vmulq_f32(inputVal3, vScalar), vmax_val), vmin_val);

-        // Convert to int32 (truncates towards zero)
+        // Round to nearest: add copysign(0.5, x) before truncating
+        uint32x4_t neg0 = vcltq_f32(ret0, zero);
+        uint32x4_t neg1 = vcltq_f32(ret1, zero);
+        uint32x4_t neg2 = vcltq_f32(ret2, zero);
+        uint32x4_t neg3 = vcltq_f32(ret3, zero);
+        ret0 = vaddq_f32(ret0, vbslq_f32(neg0, neg_half, half));
+        ret1 = vaddq_f32(ret1, vbslq_f32(neg1, neg_half, half));
+        ret2 = vaddq_f32(ret2, vbslq_f32(neg2, neg_half, half));
+        ret3 = vaddq_f32(ret3, vbslq_f32(neg3, neg_half, half));
+
+        // Convert to int32 (truncates towards zero, but we pre-rounded)
         int32x4_t intVal0 = vcvtq_s32_f32(ret0);
         int32x4_t intVal1 = vcvtq_s32_f32(ret1);
         int32x4_t intVal2 = vcvtq_s32_f32(ret2);

kernels/volk/volk_32f_x2_s32f_interleave_16ic.h

Lines changed: 9 additions & 0 deletions
@@ -343,6 +343,9 @@ static inline void volk_32f_x2_s32f_interleave_16ic_neon(lv_16sc_t* complexVecto
     int16_t* complexVectorPtr = (int16_t*)complexVector;

     float32x4_t vScalar = vdupq_n_f32(scalar);
+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t neg_half = vdupq_n_f32(-0.5f);
+    float32x4_t zero = vdupq_n_f32(0.0f);

     for (; number < quarter_points; number++) {
         float32x4_t iValue = vld1q_f32(iBufferPtr);
@@ -351,6 +354,12 @@ static inline void volk_32f_x2_s32f_interleave_16ic_neon(lv_16sc_t* complexVecto
         iValue = vmulq_f32(iValue, vScalar);
         qValue = vmulq_f32(qValue, vScalar);

+        // Round to nearest: add copysign(0.5, x) before truncating
+        uint32x4_t iNeg = vcltq_f32(iValue, zero);
+        uint32x4_t qNeg = vcltq_f32(qValue, zero);
+        iValue = vaddq_f32(iValue, vbslq_f32(iNeg, neg_half, half));
+        qValue = vaddq_f32(qValue, vbslq_f32(qNeg, neg_half, half));
+
         int32x4_t iInt = vcvtq_s32_f32(iValue);
         int32x4_t qInt = vcvtq_s32_f32(qValue);

kernels/volk/volk_32f_x3_sum_of_poly_32f.h

Lines changed: 5 additions & 61 deletions
@@ -364,66 +364,11 @@ static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>

-static inline void
-volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
-                                   float* __restrict src0,
-                                   float* __restrict center_point_array,
-                                   float* __restrict cutoff,
-                                   unsigned int num_points)
-{
-    unsigned int i;
-    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
-
-    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
-    float32x2_t cutoff_vector;
-    float32x2x2_t x_low, x_high;
-    float32x4_t x_qvector, c_qvector, cpa_qvector;
-    float accumulator;
-    float res_accumulators[4];
-
-    c_qvector = vld1q_f32(zero);
-    // load the cutoff in to a vector
-    cutoff_vector = vdup_n_f32(*cutoff);
-    // ... center point array
-    cpa_qvector = vld1q_f32(center_point_array);
-
-    for (i = 0; i < num_points; ++i) {
-        // load x (src0)
-        x_to_1 = vdup_n_f32(*src0++);
-
-        // Get a vector of max(src0, cutoff)
-        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
-        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
-        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
-        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
-        // zip up doubles to interleave
-        x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
-        x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
-        // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
-        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
-        // now we finally have [x^4 | x^3 | x^2 | x] !
-
-        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
-    }
-    // there should be better vector reduction techniques
-    vst1q_f32(res_accumulators, c_qvector);
-    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
-                  res_accumulators[3];
-
-    *target = accumulator + (float)num_points * center_point_array[4];
-}
-
-#endif /* LV_HAVE_NEON */
-
-
-#ifdef LV_HAVE_NEON
-
-static inline void
-volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
-                                     float* __restrict src0,
-                                     float* __restrict center_point_array,
-                                     float* __restrict cutoff,
-                                     unsigned int num_points)
+static inline void volk_32f_x3_sum_of_poly_32f_neon(float* __restrict target,
+                                                    float* __restrict src0,
+                                                    float* __restrict center_point_array,
+                                                    float* __restrict cutoff,
+                                                    unsigned int num_points)
 {
     unsigned int i;
     float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
@@ -446,7 +391,6 @@ volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
     cpa_2 = vdupq_n_f32(center_point_array[2]);
     cpa_3 = vdupq_n_f32(center_point_array[3]);

-    // nathan is not sure why this is slower *and* wrong compared to neonvertfma
     for (i = 0; i < num_points / 4; ++i) {
         // load x
         x_to_1 = vld1q_f32(src0);
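This hunk consolidates the two NEON variants into a single volk_32f_x3_sum_of_poly_32f_neon. As the removed _a_neon body shows, the kernel evaluates a degree-4 polynomial (minus its constant term) at max(x, cutoff) for each point, sums the results, and adds num_points times the constant coefficient. A hypothetical scalar reference of that computation (not part of the patch):

/* Scalar reference based on the removed NEON body above; the consolidated
 * kernel vectorizes the same computation four points at a time. */
static void sum_of_poly_scalar_ref(float* target,
                                   const float* src0,
                                   const float* center_point_array,
                                   const float* cutoff,
                                   unsigned int num_points)
{
    float acc = 0.0f;
    for (unsigned int i = 0; i < num_points; ++i) {
        float x = (src0[i] > *cutoff) ? src0[i] : *cutoff; /* max(x, cutoff) */
        float x2 = x * x;
        acc += center_point_array[0] * x + center_point_array[1] * x2 +
               center_point_array[2] * x2 * x + center_point_array[3] * x2 * x2;
    }
    *target = acc + (float)num_points * center_point_array[4];
}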

kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h

Lines changed: 7 additions & 0 deletions
@@ -269,11 +269,18 @@ volk_32fc_s32f_deinterleave_real_16i_neon(int16_t* iBuffer,
     int16_t* iBufferPtr = iBuffer;
     float32x4_t vScalar = vdupq_n_f32(scalar);

+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t neg_half = vdupq_n_f32(-0.5f);
+    float32x4_t zero = vdupq_n_f32(0.0f);
+
     for (; number < quarter_points; number++) {
         float32x4x2_t input = vld2q_f32(complexVectorPtr);
         complexVectorPtr += 8;

         float32x4_t scaled = vmulq_f32(input.val[0], vScalar);
+        // Round to nearest: add copysign(0.5, x) before truncating
+        uint32x4_t neg = vcltq_f32(scaled, zero);
+        scaled = vaddq_f32(scaled, vbslq_f32(neg, neg_half, half));
         int32x4_t intVal = vcvtq_s32_f32(scaled);
         int16x4_t shortVal = vqmovn_s32(intVal);


kernels/volk/volk_32fc_s32f_magnitude_16i.h

Lines changed: 4 additions & 1 deletion
@@ -317,6 +317,8 @@ static inline void volk_32fc_s32f_magnitude_16i_neon(int16_t* magnitudeVector,
     int16_t* magnitudeVectorPtr = magnitudeVector;
     float32x4_t vScalar = vdupq_n_f32(scalar);

+    float32x4_t half = vdupq_n_f32(0.5f);
+
     for (; number < quarter_points; number++) {
         float32x4x2_t input = vld2q_f32(complexVectorPtr);
         complexVectorPtr += 8;
@@ -335,7 +337,8 @@ static inline void volk_32fc_s32f_magnitude_16i_neon(int16_t* magnitudeVector,
         uint32x4_t zero_mask = vceqq_f32(sumSquared, vdupq_n_f32(0.0f));
         magnitude = vbslq_f32(zero_mask, sumSquared, magnitude);

-        float32x4_t scaled = vmulq_f32(magnitude, vScalar);
+        // Magnitude is always non-negative, so just add 0.5 for rounding
+        float32x4_t scaled = vaddq_f32(vmulq_f32(magnitude, vScalar), half);
         int32x4_t intVal = vcvtq_s32_f32(scaled);
         int16x4_t shortVal = vqmovn_s32(intVal);

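Because the clamped magnitude is never negative, the copysign selection used in the other kernels collapses to a plain +0.5f here; truncating x + 0.5f for non-negative x is ordinary round-half-up. A tiny, hypothetical sanity check (not part of the patch):

#include <assert.h>

int main(void)
{
    assert((int)(3.49f + 0.5f) == 3); /* below the half-way point: rounds down */
    assert((int)(3.50f + 0.5f) == 4); /* halves round up */
    assert((int)(0.00f + 0.5f) == 0);
    return 0;
}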

kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h

Lines changed: 0 additions & 96 deletions
@@ -55,63 +55,6 @@ static inline void volk_32fc_s32fc_rotator2puppet_32fc_neon(lv_32fc_t* outVector
 #endif /* LV_HAVE_NEON */


-#ifdef LV_HAVE_NEONV8
-#include <arm_neon.h>
-
-static inline void volk_32fc_s32fc_rotator2puppet_32fc_neonv8(lv_32fc_t* outVector,
-                                                              const lv_32fc_t* inVector,
-                                                              const lv_32fc_t* phase_inc,
-                                                              unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_neonv8(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-#endif /* LV_HAVE_NEONV8 */
-
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-static inline void
-volk_32fc_s32fc_rotator2puppet_32fc_a_sse4_1(lv_32fc_t* outVector,
-                                             const lv_32fc_t* inVector,
-                                             const lv_32fc_t* phase_inc,
-                                             unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-static inline void
-volk_32fc_s32fc_rotator2puppet_32fc_u_sse4_1(lv_32fc_t* outVector,
-                                             const lv_32fc_t* inVector,
-                                             const lv_32fc_t* phase_inc,
-                                             unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_SSE4_1 */
-
-
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>

@@ -189,45 +132,6 @@ volk_32fc_s32fc_rotator2puppet_32fc_u_avx512f(lv_32fc_t* outVector,

 #endif /* LV_HAVE_AVX512F */

-#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
-
-static inline void
-volk_32fc_s32fc_rotator2puppet_32fc_a_avx_fma(lv_32fc_t* outVector,
-                                              const lv_32fc_t* inVector,
-                                              const lv_32fc_t* phase_inc,
-                                              unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
-
-
-#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
-
-static inline void
-volk_32fc_s32fc_rotator2puppet_32fc_u_avx_fma(lv_32fc_t* outVector,
-                                              const lv_32fc_t* inVector,
-                                              const lv_32fc_t* phase_inc,
-                                              unsigned int num_points)
-{
-    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
-    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
-    const lv_32fc_t phase_inc_n =
-        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
-    volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(
-        outVector, inVector, &phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
-
 #ifdef LV_HAVE_RVV
 static inline void volk_32fc_s32fc_rotator2puppet_32fc_rvv(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
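All of the removed puppet variants share the body visible above: normalize a fixed starting phase and the supplied phase increment to unit magnitude, then forward to the matching volk_32fc_s32fc_x2_rotator2_32fc_* worker. A generic sketch of that pattern (the function name and the elided worker call are placeholders, not part of the file):

#include <math.h>
#include <volk/volk_complex.h> /* lv_32fc_t, lv_cmake, lv_creal, lv_cimag */

static inline void rotator2_puppet_pattern(lv_32fc_t* outVector,
                                           const lv_32fc_t* inVector,
                                           const lv_32fc_t* phase_inc,
                                           unsigned int num_points)
{
    /* Fixed test phase, normalized to unit magnitude. */
    lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    /* Normalize the phase increment as well. */
    const lv_32fc_t phase_inc_n =
        *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
    /* ...dispatch to an architecture-specific
       volk_32fc_s32fc_x2_rotator2_32fc_* worker here... */
    (void)outVector;
    (void)inVector;
    (void)phase_inc_n;
    (void)num_points;
}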
