multiple fixes

Signed-off-by: Magnus Lundmark <[email protected]>
gnuradio · Sep 26, 2023 · 3ce5fb0 · 3ce5fb0
1 parent 4840e4d
commit 3ce5fb0
Show file tree

Hide file tree

Showing 7 changed files with 89 additions and 67 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,6 @@
 #
 # Copyright 2011-2020 Free Software Foundation, Inc.
+# Copyright 2023 Magnus Lundmark <[email protected]>
 #
 # This file is part of VOLK
 #
@@ -248,6 +249,7 @@ install(FILES
     ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h

diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
@@ -0,0 +1,50 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2023 Magnus Lundmark <[email protected]>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX2 FMA intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#include <immintrin.h>
+
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
+{
+    const __m256 a1 = _mm256_set1_ps(+0.999999348f);
+    const __m256 a3 = _mm256_set1_ps(-0.333265149f);
+    const __m256 a5 = _mm256_set1_ps(+0.198814825f);
+    const __m256 a7 = _mm256_set1_ps(-0.134871915f);
+    const __m256 a9 = _mm256_set1_ps(+0.08387119203f);
+    const __m256 a11 = _mm256_set1_ps(-0.0370130021f);
+    const __m256 a13 = _mm256_set1_ps(+0.00786337701f);
+
+    const __m256 x_times_x = _mm256_mul_ps(x, x);
+    __m256 arctan;
+    arctan = a13;
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
+    arctan = _mm256_mul_ps(x, arctan);
+
+    return arctan;
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h
@@ -1,6 +1,6 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2015, 2023 Free Software Foundation, Inc.
+ * Copyright 2015 Free Software Foundation, Inc.
  *
  * This file is part of VOLK
  *
@@ -17,37 +17,6 @@
 #include "volk/volk_avx_intrinsics.h"
 #include <immintrin.h>
 
-/*
- * Approximate arctan(x) via polynomial expansion
- * on the interval [-1, 1]
- *
- * Maximum relative error ~6.5e-7
- * Polynomial evaluated via Horner's method
- */
-static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
-{
-    const __m256 a1 = _mm256_set1_ps(+0.999999348f);
-    const __m256 a3 = _mm256_set1_ps(-0.333265149f);
-    const __m256 a5 = _mm256_set1_ps(+0.198814825f);
-    const __m256 a7 = _mm256_set1_ps(-0.134871915f);
-    const __m256 a9 = _mm256_set1_ps(+0.08387119203f);
-    const __m256 a11 = _mm256_set1_ps(-0.0370130021f);
-    const __m256 a13 = _mm256_set1_ps(+0.00786337701f);
-
-    const __m256 x_times_x = _mm256_mul_ps(x, x);
-    __m256 arctan;
-    arctan = a13;
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
-    arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
-    arctan = _mm256_mul_ps(x, arctan);
-
-    return arctan;
-}
-
 static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
 {
     const __m128i zeros = _mm_set1_epi8(0x00);

diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2015, 20233 Free Software Foundation, Inc.
+ * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *

diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2015, 2023 Free Software Foundation, Inc.
+ * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *

diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2014, 2023 Free Software Foundation, Inc.
+ * Copyright 2014 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -53,7 +54,6 @@
  *   volk_free(out);
  * \endcode
  */
-#include <inttypes.h>
 #include <math.h>
 
 #define POLY_ORDER (13) // Use either 11, 13 or 15
@@ -74,8 +74,7 @@ static inline float arctan_approximation(const float x)
     const float a11 = -0.0134804696144362610026f;
 
     const float x_times_x = x * x;
-    float arctan;
-    arctan = a11;
+    float arctan = a11;
     arctan = fmaf(x_times_x, arctan, a9);
     arctan = fmaf(x_times_x, arctan, a7);
     arctan = fmaf(x_times_x, arctan, a5);
@@ -100,8 +99,7 @@ static inline float arctan_approximation(const float x)
     const float a13 = +0.0078633770069189183298f;
 
     const float x_times_x = x * x;
-    float arctan;
-    arctan = a13;
+    float arctan = a13;
     arctan = fmaf(x_times_x, arctan, a11);
     arctan = fmaf(x_times_x, arctan, a9);
     arctan = fmaf(x_times_x, arctan, a7);
@@ -128,8 +126,7 @@ static inline float arctan_approximation(const float x)
     const float a15 = -0.00469327610039088433147f;
 
     const float x_times_x = x * x;
-    float arctan;
-    arctan = a15;
+    float arctan = a15;
     arctan = fmaf(x_times_x, arctan, a13);
     arctan = fmaf(x_times_x, arctan, a11);
     arctan = fmaf(x_times_x, arctan, a9);
@@ -151,30 +148,30 @@ static inline float arctan_approximation(const float x)
 static inline float arctan(const float x)
 {
     /*
-     * arctan(x) + arctan(1 / x) = sign(x) * pi / 2
+     *  arctan(x) + arctan(1 / x) == sign(x) * pi / 2
      */
     const float pi_over_2 = M_PI_2;
 
     if (fabs(x) <= 1.f) {
         return arctan_approximation(x);
     } else {
-        float term = (x > 0.f) ? pi_over_2 : -pi_over_2;
-        return term - arctan_approximation(1.f / x);
+        return copysignf(pi_over_2, x) - arctan_approximation(1.f / x);
     }
 }
 
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
+#include <volk/volk_avx2_fma_intrinsics.h>
 static inline void
-volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, uint32_t num_points)
+volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
 {
     const __m256 one = _mm256_set1_ps(1.f);
     const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    uint32_t number = 0;
-    uint32_t eighth_points = num_points / 8;
+    unsigned int number = 0;
+    unsigned int eighth_points = num_points / 8;
     for (; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
@@ -199,16 +196,17 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, uint32_t num_points)
 
 #if LV_HAVE_AVX
 #include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
 static inline void
-volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
+volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
 {
     const __m256 one = _mm256_set1_ps(1.f);
     const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    uint32_t number = 0;
-    uint32_t eighth_points = num_points / 8;
+    unsigned int number = 0;
+    unsigned int eighth_points = num_points / 8;
     for (; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
@@ -235,15 +233,15 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
 #include <smmintrin.h>
 #include <volk/volk_sse_intrinsics.h>
 static inline void
-volk_32f_atan_32f_a_sse4_1(float* out, const float* in, uint32_t num_points)
+volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
 {
     const __m128 one = _mm_set1_ps(1.f);
     const __m128 pi_over_2 = _mm_set1_ps(M_PI_2);
     const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
     const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
-    uint32_t number = 0;
-    uint32_t quarter_points = num_points / 4;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
     for (; number < quarter_points; number++) {
         __m128 x = _mm_load_ps(in);
         __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
@@ -273,15 +271,15 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, uint32_t num_points)
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 static inline void
-volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, uint32_t num_points)
+volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
 {
     const __m256 one = _mm256_set1_ps(1.f);
     const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    uint32_t number = 0;
-    uint32_t eighth_points = num_points / 8;
+    unsigned int number = 0;
+    unsigned int eighth_points = num_points / 8;
     for (; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
@@ -307,15 +305,15 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, uint32_t num_points)
 #if LV_HAVE_AVX
 #include <immintrin.h>
 static inline void
-volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
+volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
 {
     const __m256 one = _mm256_set1_ps(1.f);
     const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    uint32_t number = 0;
-    uint32_t eighth_points = num_points / 8;
+    unsigned int number = 0;
+    unsigned int eighth_points = num_points / 8;
     for (; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
@@ -342,15 +340,15 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
 #include <smmintrin.h>
 #include <volk/volk_sse_intrinsics.h>
 static inline void
-volk_32f_atan_32f_u_sse4_1(float* out, const float* in, uint32_t num_points)
+volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
 {
     const __m128 one = _mm_set1_ps(1.f);
     const __m128 pi_over_2 = _mm_set1_ps(M_PI_2);
     const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
     const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
-    uint32_t number = 0;
-    uint32_t quarter_points = num_points / 4;
+    unsigned int number = 0;
+    unsigned int quarter_points = num_points / 4;
     for (; number < quarter_points; number++) {
         __m128 x = _mm_loadu_ps(in);
         __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
@@ -375,9 +373,9 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, uint32_t num_points)
 
 #ifdef LV_HAVE_GENERIC
 static inline void
-volk_32f_atan_32f_polynomial(float* out, const float* in, uint32_t num_points)
+volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
 {
-    uint32_t number = 0;
+    unsigned int number = 0;
     for (; number < num_points; number++) {
         *out++ = arctan(*in++);
     }
@@ -386,9 +384,9 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, uint32_t num_points)
 
 #ifdef LV_HAVE_GENERIC
 static inline void
-volk_32f_atan_32f_generic(float* out, const float* in, uint32_t num_points)
+volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
 {
-    uint32_t number = 0;
+    unsigned int number = 0;
     for (; number < num_points; number++) {
         *out++ = atanf(*in++);
     }

diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 - 2021 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *