Merge pull request #633 from Ka-zam/arctan

New kernels for arctan
gnuradio · Oct 3, 2023 · a26a1b8 · a26a1b8
2 parents 74b6c6a + 0343e3c
commit a26a1b8
Show file tree

Hide file tree

Showing 7 changed files with 324 additions and 324 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,6 @@
 #
 # Copyright 2011-2020 Free Software Foundation, Inc.
+# Copyright 2023 Magnus Lundmark <[email protected]>
 #
 # This file is part of VOLK
 #
@@ -249,6 +250,7 @@ install(FILES
     ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h

diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
@@ -0,0 +1,50 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2023 Magnus Lundmark <[email protected]>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold helper functions built from AVX2 and FMA intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#include <immintrin.h>
+
+/*
+ * arctan(x) approximated by an odd polynomial of degree 13,
+ * intended for x in [-1, 1].
+ *
+ * Maximum relative error ~6.5e-7
+ * Evaluated in x^2 with Horner's scheme using fused multiply-adds
+ */
+static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
+{
+    /* Coefficients of x^13 down to x^1 */
+    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);
+    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
+    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
+    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
+    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
+    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
+    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);
+
+    /* Horner steps in x^2, highest-order coefficient first */
+    const __m256 x2 = _mm256_mul_ps(x, x);
+    __m256 acc = _mm256_fmadd_ps(x2, c13, c11);
+    acc = _mm256_fmadd_ps(x2, acc, c9);
+    acc = _mm256_fmadd_ps(x2, acc, c7);
+    acc = _mm256_fmadd_ps(x2, acc, c5);
+    acc = _mm256_fmadd_ps(x2, acc, c3);
+    acc = _mm256_fmadd_ps(x2, acc, c1);
+
+    /* Multiply by x to restore the odd powers */
+    return _mm256_mul_ps(x, acc);
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -16,6 +17,43 @@
 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
 #include <immintrin.h>
 
+/*
+ * arctan(x) approximated by an odd polynomial of degree 13,
+ * intended for x in [-1, 1].
+ *
+ * Maximum relative error ~6.5e-7
+ * Evaluated in x^2 with Horner's scheme, one multiply and one add per step
+ */
+static inline __m256 _m256_arctan_poly_avx(const __m256 x)
+{
+    /* Coefficients of x^13 down to x^1 */
+    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);
+    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
+    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
+    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
+    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
+    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
+    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);
+
+    const __m256 x2 = _mm256_mul_ps(x, x);
+
+    /*
+     * Horner steps in x^2, highest-order coefficient first.
+     * No fused multiply-add intrinsic is used here: every step is an
+     * explicit multiply followed by an add.
+     */
+    __m256 acc = c13;
+    acc = _mm256_add_ps(_mm256_mul_ps(x2, acc), c11);
+    acc = _mm256_add_ps(_mm256_mul_ps(x2, acc), c9);
+    acc = _mm256_add_ps(_mm256_mul_ps(x2, acc), c7);
+    acc = _mm256_add_ps(_mm256_mul_ps(x2, acc), c5);
+    acc = _mm256_add_ps(_mm256_mul_ps(x2, acc), c3);
+    acc = _mm256_add_ps(_mm256_mul_ps(x2, acc), c1);
+
+    /* Multiply by x to restore the odd powers */
+    return _mm256_mul_ps(x, acc);
+}
+
 static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
 {
     __m256 yl, yh, tmp1, tmp2;

diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f)
 // Constant used to do log10 calculations as faster log2
 ////////////////////////////////////////////////////////////////////////
 // precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
-#define volk_log2to10factor 3.01029995663981209120
+#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120 as nearest float
+
+////////////////////////////////////////////////////////////////////////
+// arctan(x)
+////////////////////////////////////////////////////////////////////////
+static inline float volk_arctan_poly(const float x)
+{
+    /*
+     * Odd polynomial of degree 13 approximating arctan(x) on [-1, 1],
+     * maximum relative error < 6.6e-7. Coefficients of x^13 down to x^1:
+     */
+    const float c13 = +0x1.01a37cp-7f;
+    const float c11 = -0x1.2f3004p-5f;
+    const float c9 = +0x1.5785aap-4f;
+    const float c7 = -0x1.1436ap-3f;
+    const float c5 = +0x1.972be6p-3f;
+    const float c3 = -0x1.55437p-2f;
+    const float c1 = +0x1.ffffeap-1f;
+
+    /* Horner steps in x^2, highest-order coefficient first */
+    const float x2 = x * x;
+    float acc = fmaf(x2, c13, c11);
+    acc = fmaf(x2, acc, c9);
+    acc = fmaf(x2, acc, c7);
+    acc = fmaf(x2, acc, c5);
+    acc = fmaf(x2, acc, c3);
+    acc = fmaf(x2, acc, c1);
+
+    /* Multiply by x to restore the odd powers */
+    return acc * x;
+}
+
+static inline float volk_arctan(const float x)
+{
+    /*
+     * arctan(x) + arctan(1 / x) == sign(x) * pi / 2, so |x| >= 1 is
+     * evaluated through 1 / x, which lies in the polynomial's [-1, 1].
+     */
+    const float pi_over_2 = 0x1.921fb6p0f;
+
+    if (fabsf(x) < 1.f) { /* fabsf: stay in single precision */
+        return volk_arctan_poly(x);
+    }
+    return copysignf(pi_over_2, x) - volk_arctan_poly(1.f / x);
+}
 
 #endif /*INCLUDED_LIBVOLK_COMMON_H*/
diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -16,6 +17,43 @@
 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
 #include <xmmintrin.h>
 
+/*
+ * arctan(x) approximated by an odd polynomial of degree 13,
+ * intended for x in [-1, 1].
+ *
+ * Maximum relative error ~6.5e-7
+ * Evaluated in x^2 with Horner's scheme, one multiply and one add per step
+ */
+static inline __m128 _mm_arctan_poly_sse(const __m128 x)
+{
+    /* Coefficients of x^13 down to x^1 */
+    const __m128 c13 = _mm_set1_ps(+0x1.01a37cp-7f);
+    const __m128 c11 = _mm_set1_ps(-0x1.2f3004p-5f);
+    const __m128 c9 = _mm_set1_ps(+0x1.5785aap-4f);
+    const __m128 c7 = _mm_set1_ps(-0x1.1436ap-3f);
+    const __m128 c5 = _mm_set1_ps(+0x1.972be6p-3f);
+    const __m128 c3 = _mm_set1_ps(-0x1.55437p-2f);
+    const __m128 c1 = _mm_set1_ps(+0x1.ffffeap-1f);
+
+    const __m128 x2 = _mm_mul_ps(x, x);
+
+    /*
+     * Horner steps in x^2, highest-order coefficient first.
+     * No fused multiply-add intrinsic is used here: every step is an
+     * explicit multiply followed by an add.
+     */
+    __m128 acc = c13;
+    acc = _mm_add_ps(_mm_mul_ps(x2, acc), c11);
+    acc = _mm_add_ps(_mm_mul_ps(x2, acc), c9);
+    acc = _mm_add_ps(_mm_mul_ps(x2, acc), c7);
+    acc = _mm_add_ps(_mm_mul_ps(x2, acc), c5);
+    acc = _mm_add_ps(_mm_mul_ps(x2, acc), c3);
+    acc = _mm_add_ps(_mm_mul_ps(x2, acc), c1);
+
+    /* Multiply by x to restore the odd powers */
+    return _mm_mul_ps(x, acc);
+}
+
 static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
 {
     __m128 iValue, qValue;