Merge pull request #760 from Ka-zam/rcp_avx512

New AVX512F implementation
gnuradio · Mar 30, 2024 · 8a015bb · 8a015bb
2 parents 5658541 + 89bfc55
commit 8a015bb
Show file tree

Hide file tree

Showing 2 changed files with 203 additions and 1 deletion.
diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h
@@ -0,0 +1,201 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2024 Magnus Lundmark <[email protected]>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_reciprocal_32f
+ *
+ * \b Overview
+ *
+ * Computes the reciprocal of the input vector and stores the results
+ * in the output vector. For the AVX512F implementation the relative
+ * error is < 2**(-14) = 6.1e-05
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li in: A pointer to the input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: A pointer to the output vector of floats.
+ *
+ * \b Example
+ * \code
+    int N = 10;
+    unsigned int alignment = volk_get_alignment();
+    float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+    float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+
+    for(unsigned int ii = 1; ii < N; ++ii){
+        in[ii] = (float)(ii*ii);
+    }
+
+    volk_32f_reciprocal_32f(out, in, N);
+
+    for(unsigned int ii = 0; ii < N; ++ii){
+        printf("out(%i) = %f\n", ii, out[ii]);
+    }
+
+    volk_free(in);
+    volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H
+#define INCLUDED_volk_32f_reciprocal_32f_a_H
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+    for (unsigned int i = 0; i < num_points; i++) {
+        out[i] = 1.f / in[i];
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+static inline void
+volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_points)
+{
+    const __m128 ONE = _mm_set_ps1(1.f);
+    const unsigned int quarter_points = num_points / 4;
+
+    for (unsigned int number = 0; number < quarter_points; number++) {
+        __m128 x = _mm_load_ps(in);
+        in += 4;
+        __m128 r = _mm_div_ps(ONE, x);
+        _mm_store_ps(out, r);
+        out += 4;
+    }
+
+    const unsigned int done = quarter_points * 4;
+
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
+{
+    const __m256 ONE = _mm256_set1_ps(1.f);
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        in += 8;
+        __m256 r = _mm256_div_ps(ONE, x);
+        _mm256_store_ps(out, r);
+        out += 8;
+    }
+
+    const unsigned int done = eighth_points * 8;
+
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points)
+{
+    const unsigned int sixteenth_points = num_points / 16;
+
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
+        __m512 x = _mm512_load_ps(in);
+        in += 16;
+        __m512 r = _mm512_rcp14_ps(x);
+        _mm512_store_ps(out, r);
+        out += 16;
+    }
+
+    const unsigned int done = sixteenth_points * 16;
+
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_AVX512F */
+
+#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H
+#define INCLUDED_volk_32f_reciprocal_32f_u_H
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+static inline void
+volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_points)
+{
+    const __m128 ONE = _mm_set_ps1(1.f);
+    const unsigned int quarter_points = num_points / 4;
+
+    for (unsigned int number = 0; number < quarter_points; number++) {
+        __m128 x = _mm_loadu_ps(in);
+        in += 4;
+        __m128 r = _mm_div_ps(ONE, x);
+        _mm_storeu_ps(out, r);
+        out += 4;
+    }
+
+    const unsigned int done = quarter_points * 4;
+
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
+{
+    const __m256 ONE = _mm256_set1_ps(1.f);
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        in += 8;
+        __m256 r = _mm256_div_ps(ONE, x);
+        _mm256_storeu_ps(out, r);
+        out += 8;
+    }
+
+    const unsigned int done = eighth_points * 8;
+
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points)
+{
+    const unsigned int sixteenth_points = num_points / 16;
+
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
+        __m512 x = _mm512_loadu_ps(in);
+        in += 16;
+        __m512 r = _mm512_rcp14_ps(x);
+        _mm512_storeu_ps(out, r);
+        out += 16;
+    }
+
+    const unsigned int done = sixteenth_points * 16;
+
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_AVX512F */
+
+#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 - 2021 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark <[email protected]>
+ * Copyright 2023, 2024 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -141,6 +141,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
     QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
     QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
     QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params.make_tol(6.15e-5)))
     QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_absolute(1e-5)))