25 changes: 25 additions & 0 deletions libairspy/src/airspy.c
@@ -39,6 +39,11 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSI
#include "iqconverter_int16.h"
#include "filters.h"

#if defined(__ARM_NEON) && __ARM_NEON
#include <arm_neon.h>
#define USE_NEON
#endif

#ifndef bool
typedef int bool;
#define true 1
@@ -306,13 +311,33 @@ static void convert_samples_int16(uint16_t *src, int16_t *dest, int count)

static void convert_samples_float(uint16_t *src, float *dest, int count)
{
#if defined(USE_NEON)
const float32x4_t offset_f32 = vmovq_n_f32(2048);
const float32x4_t sample_scale_f32 = vmovq_n_f32(SAMPLE_SCALE);
#endif

int i;
for (i = 0; i < count; i += 4)
{

#if defined(USE_NEON)

const uint16x4_t src_u16 = vld1_u16(src + i);
const uint32x4_t src_u32 = vmovl_u16(src_u16);
const float32x4_t src_f32 = vcvtq_f32_u32(src_u32);
const float32x4_t src_offset_f32 = vsubq_f32(src_f32, offset_f32);
const float32x4_t dest_f32 = vmulq_f32(src_offset_f32, sample_scale_f32);

vst1q_f32(dest + i, dest_f32);

#else

dest[i + 0] = (src[i + 0] - 2048) * SAMPLE_SCALE;
dest[i + 1] = (src[i + 1] - 2048) * SAMPLE_SCALE;
dest[i + 2] = (src[i + 2] - 2048) * SAMPLE_SCALE;
dest[i + 3] = (src[i + 3] - 2048) * SAMPLE_SCALE;

#endif
}
}

98 changes: 92 additions & 6 deletions libairspy/src/iqconverter_float.c
@@ -62,6 +62,11 @@ void *_aligned_malloc(size_t size, size_t alignment)
#endif
#endif

#if defined(__ARM_NEON) && __ARM_NEON
#include <arm_neon.h>
#define USE_NEON
#endif

#define SIZE_FACTOR 32
#define DEFAULT_ALIGNMENT 16
#define HPF_COEFF 0.01f
@@ -122,6 +127,10 @@ static _inline float process_fir_taps(const float *kernel, const float *queue, i

__m128 acc = _mm_set_ps(0, 0, 0, 0);

#elif defined(USE_NEON)

float32x4_t acc = vmovq_n_f32(0);

#else

float sum = 0.0f;
@@ -150,6 +159,21 @@ static _inline float process_fir_taps(const float *kernel, const float *queue, i

queue += 8;
kernel += 8;

#elif defined(USE_NEON)

for (i = 0; i < it; i++)
{
const float32x4_t head1 = vld1q_f32(queue);
const float32x4_t kern1 = vld1q_f32(kernel);
const float32x4_t head2 = vld1q_f32(queue + 4);
const float32x4_t kern2 = vld1q_f32(kernel + 4);

acc = vmlaq_f32(acc, kern1, head1);
acc = vmlaq_f32(acc, kern2, head2);


Actually, I think I found a performance issue here. If you use the same accumulator for both FMLA operations, the CPU won't be able to parallelise them, even though the quad registers themselves are independent. Try setting up two accumulators and summing them after the loop. It should give you a ~2x performance boost.

I did some analysis here for different number of quad registers: https://dernasherbrezon.com/posts/fir-filter-optimization-simd/

Screenshot 2023-06-27 at 00 00 03
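
To illustrate, a minimal sketch of the two-accumulator pattern (variable names follow the surrounding process_fir_taps loop; this is an illustration, not the exact committed change):

```c
#include <arm_neon.h>

/* Sketch: the same 8-taps-per-iteration loop, but with two independent
 * accumulators so the two FMLA chains can issue in parallel.  The caller
 * reduces the returned vector to a scalar exactly as before. */
static float32x4_t fir_taps_two_accumulators(const float *kernel,
                                             const float *queue, int it)
{
	float32x4_t acc0 = vmovq_n_f32(0);
	float32x4_t acc1 = vmovq_n_f32(0);
	int i;

	for (i = 0; i < it; i++)
	{
		acc0 = vmlaq_f32(acc0, vld1q_f32(kernel),     vld1q_f32(queue));
		acc1 = vmlaq_f32(acc1, vld1q_f32(kernel + 4), vld1q_f32(queue + 4));

		queue += 8;
		kernel += 8;
	}

	/* Combine the independent partial sums once, after the loop. */
	return vaddq_f32(acc0, acc1);
}
```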

@sergeyvfx (Contributor Author) commented on Jun 27, 2023:

Thanks! Nice to see someone digging deeper into the patch! :)

Indeed, using multiple accumulators helps in this function. I've committed a small change for it, so you might want to give it another whirl!

Btw, the article is pretty cool! There is one thing to clarify though: our time measurements are different. I was measuring the overall processing time spent in consumer_threadproc when running ./airspy-tools/src/airspy_rx -t 0 -f 50 -r output.wav, not the speedup of individual functions. It seemed a more practical measure, as it more closely reflects the amount of resources freed up for other calculations.

The code I used for benchmarking is in the benchmark branch of my fork: sergeyvfx/airspyone_host@2b6f827f714
One thing to note: while it seemed to work fine on the Raspberry Pi back then, I was now unable to get reliable numbers on an Apple M2. There might be some silly mistake in the timing code.

Edit: It might be worth mentioning that in the airspy_rx test above, process_fir_taps does not seem to be used; it is things like fir_interleaved_24 that show up as the hotspot.
Edit 2: It is actually worth double-checking which code paths are used with the default configuration: the default filter size is 47, so it is not obvious why the 24-element specialization is used. I don't currently have access to the hardware to verify.
Edit 3: Turned out to be easy: cnv->len = len / 2 + 1; in iqconverter_float_create (integer division: 47 / 2 + 1 = 24).


iqconverter_float_process is the bottleneck. You can run the Instruments application from Xcode and attach it to the running airspy_rx.

Screenshot 2023-06-27 at 22 18 51

The FIR filter can be optimised, but one of the most heavily loaded functions, remove_dc, cannot. It is not SIMD-friendly because each iteration uses the result of the previous one. I tried to figure out how to change it algorithmically, but I cannot understand how it removes DC.

@sergeyvfx (Contributor Author):

One of the ways to remove DC from a signal is to subtract its running average from the signal. That is effectively what happens in remove_dc: it tracks the average with a simplified exponential moving average. Typically you'd see a lerp, but this FMA-style average update works well enough and is cheaper.
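
To spell it out, the two equivalent forms of the update look roughly like this (sketch only; alpha stands in for SCALE and the helper names are made up):

```c
/* Textbook exponential moving average (lerp form). */
static float dc_update_lerp(float avg, float x, float alpha)
{
	return avg + alpha * (x - avg);
}

/* The form used in remove_dc: the sample is DC-corrected first, and the
 * already-computed (x - avg) is reused for the average update. */
static float dc_update_inplace(float *x, float avg, float alpha)
{
	*x -= avg;                /* x now holds x - avg                 */
	return avg + alpha * *x;  /* same update as the lerp form above  */
}
```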

It also seems that memmove contributes considerably to the timing? Perhaps something like a double-buffered circular buffer would help. It would probably help much more on the Raspberry Pi, where memory transfers are not that fast. The tricky part is that such an approach would not be usable for the SSE path, as it ruins data alignment. A rough sketch of the idea is below.
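
To make the idea concrete, here is a rough sketch of such a queue (names and the 24-tap size are assumptions, not code from this patch): every sample is written twice, one tap-count apart, so a contiguous window always starts at the write position and no memmove is needed, at the cost of unaligned loads, which is why it would not suit the SSE path.

```c
#include <stddef.h>

#define FIR_TAPS 24

typedef struct {
	float  buf[2 * FIR_TAPS];  /* double-length backing store      */
	size_t pos;                /* newest-sample index, 0..TAPS-1   */
} fir_queue_t;

static void fir_queue_push(fir_queue_t *q, float x)
{
	q->pos = (q->pos == 0) ? FIR_TAPS - 1 : q->pos - 1;
	q->buf[q->pos] = x;             /* primary copy  */
	q->buf[q->pos + FIR_TAPS] = x;  /* mirrored copy */
}

static const float *fir_queue_window(const fir_queue_t *q)
{
	/* FIR_TAPS contiguous samples, newest first, no memmove needed. */
	return &q->buf[q->pos];
}
```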


queue += 8;
kernel += 8;
}

#else
@@ -183,6 +207,12 @@ static _inline float process_fir_taps(const float *kernel, const float *queue, i
__m128 mul = _mm_mul_ps(kern, head);
acc = _mm_add_ps(acc, mul);

#elif defined(USE_NEON)

const float32x4_t head = vld1q_f32(queue);
const float32x4_t kern = vld1q_f32(kernel);
acc = vmlaq_f32(acc, kern, head);

#else

sum += kernel[0] * queue[0]
@@ -208,6 +238,8 @@ static _inline float process_fir_taps(const float *kernel, const float *queue, i
float sum = acc.m128_f32[0];
#endif

#elif defined(USE_NEON)
float sum = vaddvq_f32(acc);


Compilation fails on the Raspberry Pi. This intrinsic is from the A64 (AArch64) instruction set, while the Raspberry Pi here is 32-bit: https://developer.arm.com/architectures/instruction-sets/intrinsics/#q=vaddvq_f32

@sergeyvfx (Contributor Author):

Good catch!
I've committed some tweaks, which should hopefully fix the issue. Unfortunately, I do not currently have a 32-bit ARM platform handy, so I could not verify whether other fixes are needed.
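
For reference, a common AArch32-compatible way to do the horizontal sum looks roughly like this (a sketch of the kind of fallback that would be needed, not necessarily the committed fix):

```c
#include <arm_neon.h>

/* Horizontal sum of a float32x4_t using only A32/AArch32 intrinsics. */
static inline float horizontal_sum_f32(float32x4_t v)
{
	/* {a, b, c, d} -> {a+c, b+d} -> a+b+c+d */
	float32x2_t pair = vadd_f32(vget_low_f32(v), vget_high_f32(v));
	pair = vpadd_f32(pair, pair);
	return vget_lane_f32(pair, 0);
}
```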

#endif

if (len >= 2)
@@ -327,6 +359,14 @@ static void fir_interleaved_12(iqconverter_float_t *cnv, float *samples, int len
cnv->fir_index = fir_index;
}

#if defined(USE_NEON)
static inline float32x4_t vld1q_f32_reversed(const float* values) {
float32x4_t v = vld1q_f32(values); /* 0 1 2 3 */
v = vrev64q_f32(v); /* 1 0 3 2 */
return vextq_f32(v, v, 2); /* 3 2 1 0 */
}
#endif

static void fir_interleaved_24(iqconverter_float_t *cnv, float *samples, int len)
{
int i;
@@ -335,14 +375,39 @@ static void fir_interleaved_24(iqconverter_float_t *cnv, float *samples, int len
float *fir_kernel = cnv->fir_kernel;
float *fir_queue = cnv->fir_queue;
float *queue;

#if defined(USE_NEON)
const float32x4_t kernel1 = vld1q_f32(fir_kernel + 0);
const float32x4_t kernel2 = vld1q_f32(fir_kernel + 4);
const float32x4_t kernel3 = vld1q_f32(fir_kernel + 8);

float32x4_t acc;
#else
float acc = 0;
#endif

for (i = 0; i < len; i += 2)
{
queue = fir_queue + fir_index;

queue[0] = samples[i];

#if defined(USE_NEON)

const float32x4_t queue1_1 = vld1q_f32(queue + 0);
const float32x4_t queue2_1 = vld1q_f32(queue + 4);
const float32x4_t queue3_1 = vld1q_f32(queue + 8);

const float32x4_t queue3_2 = vld1q_f32_reversed(queue + 12);
const float32x4_t queue2_2 = vld1q_f32_reversed(queue + 16);
const float32x4_t queue1_2 = vld1q_f32_reversed(queue + 20);

acc = vmulq_f32(kernel1, vaddq_f32(queue1_1, queue1_2));
acc = vmlaq_f32(acc, kernel2, vaddq_f32(queue2_1, queue2_2));
acc = vmlaq_f32(acc, kernel3, vaddq_f32(queue3_1, queue3_2));

samples[i] = vaddvq_f32(acc);
#else
acc = fir_kernel[0] * (queue[0] + queue[24 - 1])
+ fir_kernel[1] * (queue[1] + queue[24 - 2])
+ fir_kernel[2] * (queue[2] + queue[24 - 3])
Expand All @@ -357,6 +422,7 @@ static void fir_interleaved_24(iqconverter_float_t *cnv, float *samples, int len
+ fir_kernel[11] * (queue[11] + queue[24 - 12]);

samples[i] = acc;
#endif

if (--fir_index < 0)
{
@@ -446,21 +512,23 @@ static void delay_interleaved(iqconverter_float_t *cnv, float *samples, int len)

static void remove_dc(iqconverter_float_t *cnv, float *samples, int len)
{
int i;
float *sample = samples;
const float *samples_end = sample + len;

ALIGNED float avg = cnv->avg;

for (i = 0; i < len; i++)
while (sample < samples_end)
{
samples[i] -= avg;
avg += SCALE * samples[i];
*sample -= avg;
avg += SCALE * (*sample);
++sample;
}

cnv->avg = avg;
}

static void translate_fs_4(iqconverter_float_t *cnv, float *samples, int len)
{
int i;
ALIGNED float hbc = cnv->hbc;

#ifdef USE_SSE2
@@ -476,9 +544,27 @@ static void translate_fs_4(iqconverter_float_t *cnv, float *samples, int len)
_mm_storeu_ps(buf, vec);
}

#elif defined(USE_NEON)

float *buf = samples;
const float *buf_end = buf + len;
float32x4_t vec;
float rot_data[4] = {-1.0f, -hbc, 1.0f, hbc};
const float32x4_t rot = vld1q_f32(rot_data);

while (buf < buf_end)
{
vec = vld1q_f32(buf);

vec = vmulq_f32(vec, rot);
vst1q_f32(buf, vec);

buf += 4;
}

#else

int j;
int i, j;

for (i = 0; i < len / 4; i++)
{