Optimize float32 IQ code path using ARM NEON #89
@@ -62,6 +62,11 @@ void *_aligned_malloc(size_t size, size_t alignment)
 #endif
 #endif
 
+#if defined(__ARM_NEON) && __ARM_NEON
+#include <arm_neon.h>
+#define USE_NEON
+#endif
+
 #define SIZE_FACTOR 32
 #define DEFAULT_ALIGNMENT 16
 #define HPF_COEFF 0.01f

@@ -122,6 +127,10 @@ static _inline float process_fir_taps(const float *kernel, const float *queue, i
 	__m128 acc = _mm_set_ps(0, 0, 0, 0);
 
+#elif defined(USE_NEON)
+
+	float32x4_t acc = vmovq_n_f32(0);
+
 #else
 
 	float sum = 0.0f;

@@ -150,6 +159,21 @@ static _inline float process_fir_taps(const float *kernel, const float *queue, i
 		queue += 8;
 		kernel += 8;
 
+#elif defined(USE_NEON)
+
+	for (i = 0; i < it; i++)
+	{
+		const float32x4_t head1 = vld1q_f32(queue);
+		const float32x4_t kern1 = vld1q_f32(kernel);
+		const float32x4_t head2 = vld1q_f32(queue + 4);
+		const float32x4_t kern2 = vld1q_f32(kernel + 4);
+
+		acc = vmlaq_f32(acc, kern1, head1);
+		acc = vmlaq_f32(acc, kern2, head2);
+
+		queue += 8;
+		kernel += 8;
+	}
+
 #else

@@ -183,6 +207,12 @@ static _inline float process_fir_taps(const float *kernel, const float *queue, i
 		__m128 mul = _mm_mul_ps(kern, head);
 		acc = _mm_add_ps(acc, mul);
 
+#elif defined(USE_NEON)
+
+		const float32x4_t head = vld1q_f32(queue);
+		const float32x4_t kern = vld1q_f32(kernel);
+		acc = vmlaq_f32(acc, kern, head);
+
 #else
 
 		sum += kernel[0] * queue[0]

@@ -208,6 +238,8 @@ static _inline float process_fir_taps(const float *kernel, const float *queue, i
 	float sum = acc.m128_f32[0];
 #endif
 
+#elif defined(USE_NEON)
+	float sum = vaddvq_f32(acc);
 #endif
 
 	if (len >= 2)

@@ -327,6 +359,14 @@ static void fir_interleaved_12(iqconverter_float_t *cnv, float *samples, int len
 	cnv->fir_index = fir_index;
 }
 
+#if defined(USE_NEON)
+static inline float32x4_t vld1q_f32_reversed(const float* values) {
+	float32x4_t v = vld1q_f32(values); /* 0 1 2 3 */
+	v = vrev64q_f32(v);                /* 1 0 3 2 */
+	return vextq_f32(v, v, 2);         /* 3 2 1 0 */
+}
+#endif
+
 static void fir_interleaved_24(iqconverter_float_t *cnv, float *samples, int len)
 {
 	int i;

@@ -335,14 +375,39 @@ static void fir_interleaved_24(iqconverter_float_t *cnv, float *samples, int len
 	float *fir_kernel = cnv->fir_kernel;
 	float *fir_queue = cnv->fir_queue;
 	float *queue;
 
+#if defined(USE_NEON)
+	const float32x4_t kernel1 = vld1q_f32(fir_kernel + 0);
+	const float32x4_t kernel2 = vld1q_f32(fir_kernel + 4);
+	const float32x4_t kernel3 = vld1q_f32(fir_kernel + 8);
+
+	float32x4_t acc;
+#else
 	float acc = 0;
+#endif
 
 	for (i = 0; i < len; i += 2)
 	{
 		queue = fir_queue + fir_index;
 
 		queue[0] = samples[i];
 
+#if defined(USE_NEON)
+
+		const float32x4_t queue1_1 = vld1q_f32(queue + 0);
+		const float32x4_t queue2_1 = vld1q_f32(queue + 4);
+		const float32x4_t queue3_1 = vld1q_f32(queue + 8);
+
+		const float32x4_t queue3_2 = vld1q_f32_reversed(queue + 12);
+		const float32x4_t queue2_2 = vld1q_f32_reversed(queue + 16);
+		const float32x4_t queue1_2 = vld1q_f32_reversed(queue + 20);
+
+		acc = vmulq_f32(kernel1, vaddq_f32(queue1_1, queue1_2));
+		acc = vmlaq_f32(acc, kernel2, vaddq_f32(queue2_1, queue2_2));
+		acc = vmlaq_f32(acc, kernel3, vaddq_f32(queue3_1, queue3_2));
+
+		samples[i] = vaddvq_f32(acc);
+#else
 		acc = fir_kernel[0] * (queue[0] + queue[24 - 1])
 			+ fir_kernel[1] * (queue[1] + queue[24 - 2])
 			+ fir_kernel[2] * (queue[2] + queue[24 - 3])

@@ -357,6 +422,7 @@ static void fir_interleaved_24(iqconverter_float_t *cnv, float *samples, int len
 			+ fir_kernel[11] * (queue[11] + queue[24 - 12]);
 
 		samples[i] = acc;
+#endif
 
 		if (--fir_index < 0)
 		{

@@ -446,21 +512,23 @@ static void delay_interleaved(iqconverter_float_t *cnv, float *samples, int len)
 
 static void remove_dc(iqconverter_float_t *cnv, float *samples, int len)
 {
-	int i;
+	float *sample = samples;
+	const float *samples_end = sample + len;
 
 	ALIGNED float avg = cnv->avg;
 
-	for (i = 0; i < len; i++)
+	while (sample < samples_end)
 	{
-		samples[i] -= avg;
-		avg += SCALE * samples[i];
+		*sample -= avg;
+		avg += SCALE * (*sample);
+		++sample;
 	}
 
 	cnv->avg = avg;
 }
 
 static void translate_fs_4(iqconverter_float_t *cnv, float *samples, int len)
 {
 	int i;
 	ALIGNED float hbc = cnv->hbc;
 
 #ifdef USE_SSE2

@@ -476,9 +544,27 @@ static void translate_fs_4(iqconverter_float_t *cnv, float *samples, int len)
 		_mm_storeu_ps(buf, vec);
 	}
 
+#elif defined(USE_NEON)
+
+	float *buf = samples;
+	const float *buf_end = buf + len;
+	float32x4_t vec;
+	float rot_data[4] = {-1.0f, -hbc, 1.0f, hbc};
+	const float32x4_t rot = vld1q_f32(rot_data);
+
+	while (buf < buf_end)
+	{
+		vec = vld1q_f32(buf);
+
+		vec = vmulq_f32(vec, rot);
+		vst1q_f32(buf, vec);
+
+		buf += 4;
+	}
+
 #else
 
-	int j;
+	int i, j;
 
 	for (i = 0; i < len / 4; i++)
 	{
Actually, I think I found a performance issue here. If you use the same accumulator for both FMLA operations, the CPU won't be able to execute them in parallel, even though the quad registers they read from are independent. Try setting up two accumulators and summing them after the loop. It should give you roughly a 2x performance boost.
I did some analysis of different numbers of quad registers here: https://dernasherbrezon.com/posts/fir-filter-optimization-simd/
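Something along these lines, just to illustrate (a standalone sketch reusing the names from the patch; it assumes len is a multiple of 8, as in the vectorised part of process_fir_taps):

#include <arm_neon.h>

static float fir_taps_neon_two_acc(const float *kernel, const float *queue, int len)
{
	/* Two independent accumulators: the two multiply-accumulates per
	   iteration no longer share a register, so they don't form one long
	   dependency chain and can issue in parallel. */
	float32x4_t acc0 = vmovq_n_f32(0);
	float32x4_t acc1 = vmovq_n_f32(0);
	int i;

	for (i = 0; i < len / 8; i++)
	{
		const float32x4_t head1 = vld1q_f32(queue);
		const float32x4_t kern1 = vld1q_f32(kernel);
		const float32x4_t head2 = vld1q_f32(queue + 4);
		const float32x4_t kern2 = vld1q_f32(kernel + 4);

		acc0 = vmlaq_f32(acc0, kern1, head1);
		acc1 = vmlaq_f32(acc1, kern2, head2);

		queue += 8;
		kernel += 8;
	}

	/* Fold the two partial sums only once, after the loop. */
	return vaddvq_f32(vaddq_f32(acc0, acc1));
}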
Thanks! Nice to see someone is digging deeper into the patch! :)
Indeed, using multiple accumulators helps in this function. I've committed a small change for it, so you might want to give it another whirl!
Btw, the article is pretty cool! There is one thing to clarify though: our time measurements are different. I was measuring the overall processing time spent in consumer_threadproc when running ./airspy-tools/src/airspy_rx -t 0 -f 50 -r output.wav, not the speedup of individual functions. That seemed the more practical measure, since it more closely reflects the amount of resources freed up for other calculations.
The code I used for benchmarking is in the benchmark branch of my fork: sergeyvfx/airspyone_host@2b6f827f714
One thing to note: while it seemed to work fine on the Raspberry Pi back then, I was unable to get reliable numbers on an Apple M2 now. There might be some stupid mistake in the timing code.
Edit: It might be worth mentioning that in the airspy_rx test above, process_fir_taps does not seem to be used; it is things like fir_interleaved_24 that show up as the hotspot.
Edit 2: It is actually worth double-checking which code paths are used with the default configuration. The default filter size is 47, so it is not really obvious why the 24-element specialization is used. I don't currently have access to the hardware to verify.
Edit 3: Turned out to be easy: cnv->len = len / 2 + 1; in iqconverter_float_create.
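(With the default 47-tap kernel that integer expression gives 47 / 2 + 1 = 24, which would explain why the 24-element specialization is the one being exercised.)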
iqconverter_float_process is the bottleneck. You can run the Instruments application from Xcode and attach it to the running airspy_rx. The FIR filter can be optimised, but one of the most heavily loaded functions, remove_dc, cannot. It is not SIMD-friendly, because each iteration uses the result of the previous one. I tried to figure out how to change it algorithmically, but I cannot understand how it removes DC.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One way to remove DC from a signal is to subtract its average from the signal, and that is effectively what happens in remove_dc. It tracks the average with a sort of simplified exponential moving average. Typically you'd see a lerp here, but this FMA-style average update works well enough and is cheaper.
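Spelling the loop body out (x is the input sample, y the output, avg the running estimate):

y[n]       = x[n] - avg[n]
avg[n + 1] = avg[n] + SCALE * y[n]
           = (1 - SCALE) * avg[n] + SCALE * x[n]

So avg is a one-pole low-pass estimate of the signal mean, and subtracting it from each sample acts as a DC-blocking high-pass whose cutoff is set by SCALE.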
It also seems memmove makes a considerable contribution to the timing? Perhaps something like a double-buffered circular buffer would help. It would probably help much more on the Raspberry Pi, where memory transfers are not that fast. The tricky part is that such an approach would not be usable for the SSE path, as it ruins data alignment.
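A rough sketch of that idea (hypothetical helper, not code from this patch): keep the delay line at twice the kernel length and write each incoming sample into two mirrored slots, so a contiguous window of the most recent taps is always available without a memmove.

/* Hypothetical double-buffered FIR delay line for the 24-tap case: every
   sample is stored twice, so queue + pos always points at 24 contiguous
   samples (newest first) and the queue never has to be shifted.
   Initialise with pos = 0 and queue zeroed. */
typedef struct
{
	float queue[2 * 24];
	int pos;
} fir_queue24_t;

static void fir_queue24_push(fir_queue24_t *q, float sample)
{
	if (--q->pos < 0)
		q->pos = 24 - 1;

	q->queue[q->pos] = sample;       /* primary slot */
	q->queue[q->pos + 24] = sample;  /* mirrored copy keeps the window contiguous */
}

static const float *fir_queue24_window(const fir_queue24_t *q)
{
	return q->queue + q->pos;        /* 24 contiguous samples, newest first */
}

As noted, the window pointer is then only element-aligned, which is presumably what breaks the SSE path's aligned loads; vld1q_f32 on the NEON side does not require 16-byte alignment.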