Skip to content

Commit a26a1b8

Browse files
authored
Merge pull request #633 from Ka-zam/arctan
New kernels for arctan
2 parents 74b6c6a + 0343e3c commit a26a1b8

File tree

7 files changed

+324
-324
lines changed

7 files changed

+324
-324
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#
22
# Copyright 2011-2020 Free Software Foundation, Inc.
3+
# Copyright 2023 Magnus Lundmark <[email protected]>
34
#
45
# This file is part of VOLK
56
#
@@ -249,6 +250,7 @@ install(FILES
249250
${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
250251
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
251252
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
253+
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
252254
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
253255
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
254256
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/* -*- c++ -*- */
2+
/*
3+
* Copyright 2023 Magnus Lundmark <[email protected]>
4+
*
5+
* This file is part of VOLK
6+
*
7+
* SPDX-License-Identifier: LGPL-3.0-or-later
8+
*/
9+
10+
/*
11+
* This file is intended to hold helper functions composed of AVX2 FMA intrinsics.
12+
* They should be used in VOLK kernels to avoid copy-paste.
13+
*/
14+
15+
#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
16+
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
17+
#include <immintrin.h>
18+
19+
/*
 * Polynomial approximation of arctan(x) for x in the interval [-1, 1],
 * applied to every lane of a 256-bit float vector.
 *
 * The approximation is an odd polynomial of degree 13. It is evaluated with
 * Horner's method in x*x, using one fused multiply-add per coefficient, and
 * the result is finally multiplied by x.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Coefficients of x^1, x^3, ..., x^13 broadcast to all lanes */
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Horner steps, highest-order coefficient first: poly = x2 * poly + c */
    __m256 poly = _mm256_fmadd_ps(x2, c13, c11);
    poly = _mm256_fmadd_ps(x2, poly, c9);
    poly = _mm256_fmadd_ps(x2, poly, c7);
    poly = _mm256_fmadd_ps(x2, poly, c5);
    poly = _mm256_fmadd_ps(x2, poly, c3);
    poly = _mm256_fmadd_ps(x2, poly, c1);

    /* The polynomial is odd, so the remaining factor of x is applied last */
    return _mm256_mul_ps(x, poly);
}
49+
50+
#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */

include/volk/volk_avx_intrinsics.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* -*- c++ -*- */
22
/*
33
* Copyright 2015 Free Software Foundation, Inc.
4+
* Copyright 2023 Magnus Lundmark <[email protected]>
45
*
56
* This file is part of VOLK
67
*
@@ -16,6 +17,43 @@
1617
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
1718
#include <immintrin.h>
1819

20+
/*
 * Polynomial approximation of arctan(x) for x in the interval [-1, 1],
 * applied to every lane of a 256-bit float vector.
 *
 * The approximation is an odd polynomial of degree 13. It is evaluated with
 * Horner's method in x*x, using a separate multiply and add per coefficient,
 * and the result is finally multiplied by x.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    /* Coefficients of x^1, x^3, ..., x^13 broadcast to all lanes */
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Horner steps, highest-order coefficient first: poly = x2 * poly + c */
    __m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, c13), c11);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c9);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c7);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c5);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c3);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c1);

    /* The polynomial is odd, so the remaining factor of x is applied last */
    return _mm256_mul_ps(x, poly);
}
56+
1957
static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
2058
{
2159
__m256 yl, yh, tmp1, tmp2;

include/volk/volk_common.h

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* -*- c++ -*- */
22
/*
33
* Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
4+
* Copyright 2023 Magnus Lundmark <[email protected]>
45
*
56
* This file is part of VOLK
67
*
@@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f)
166167
// Constant used to do log10 calculations as faster log2
167168
////////////////////////////////////////////////////////////////////////
168169
// precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
169-
#define volk_log2to10factor 3.01029995663981209120
170+
#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120 rounded to single precision, written as a hex float literal (no 'f' suffix, so the constant has type double)
171+
172+
////////////////////////////////////////////////////////////////////////
173+
// arctan(x)
174+
////////////////////////////////////////////////////////////////////////
175+
static inline float volk_arctan_poly(const float x)
176+
{
177+
/*
178+
* arctan(x) polynomial expansion on the interval [-1, 1]
179+
* Maximum relative error < 6.6e-7
180+
*/
181+
const float a1 = +0x1.ffffeap-1f;
182+
const float a3 = -0x1.55437p-2f;
183+
const float a5 = +0x1.972be6p-3f;
184+
const float a7 = -0x1.1436ap-3f;
185+
const float a9 = +0x1.5785aap-4f;
186+
const float a11 = -0x1.2f3004p-5f;
187+
const float a13 = +0x1.01a37cp-7f;
188+
189+
const float x_times_x = x * x;
190+
float arctan = a13;
191+
arctan = fmaf(x_times_x, arctan, a11);
192+
arctan = fmaf(x_times_x, arctan, a9);
193+
arctan = fmaf(x_times_x, arctan, a7);
194+
arctan = fmaf(x_times_x, arctan, a5);
195+
arctan = fmaf(x_times_x, arctan, a3);
196+
arctan = fmaf(x_times_x, arctan, a1);
197+
arctan *= x;
198+
199+
return arctan;
200+
}
201+
202+
static inline float volk_arctan(const float x)
203+
{
204+
/*
205+
* arctan(x) + arctan(1 / x) == sign(x) * pi / 2
206+
*/
207+
const float pi_over_2 = 0x1.921fb6p0f;
208+
209+
if (fabs(x) < 1.f) {
210+
return volk_arctan_poly(x);
211+
} else {
212+
return copysignf(pi_over_2, x) - volk_arctan_poly(1.f / x);
213+
}
214+
}
170215

171216
#endif /*INCLUDED_LIBVOLK_COMMON_H*/

include/volk/volk_sse_intrinsics.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* -*- c++ -*- */
22
/*
33
* Copyright 2015 Free Software Foundation, Inc.
4+
* Copyright 2023 Magnus Lundmark <[email protected]>
45
*
56
* This file is part of VOLK
67
*
@@ -16,6 +17,43 @@
1617
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
1718
#include <xmmintrin.h>
1819

20+
/*
 * Polynomial approximation of arctan(x) for x in the interval [-1, 1],
 * applied to every lane of a 128-bit float vector.
 *
 * The approximation is an odd polynomial of degree 13. It is evaluated with
 * Horner's method in x*x, using a separate multiply and add per coefficient,
 * and the result is finally multiplied by x.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    /* Coefficients of x^1, x^3, ..., x^13 broadcast to all lanes */
    const __m128 c1 = _mm_set1_ps(+0x1.ffffeap-1f);
    const __m128 c3 = _mm_set1_ps(-0x1.55437p-2f);
    const __m128 c5 = _mm_set1_ps(+0x1.972be6p-3f);
    const __m128 c7 = _mm_set1_ps(-0x1.1436ap-3f);
    const __m128 c9 = _mm_set1_ps(+0x1.5785aap-4f);
    const __m128 c11 = _mm_set1_ps(-0x1.2f3004p-5f);
    const __m128 c13 = _mm_set1_ps(+0x1.01a37cp-7f);

    const __m128 x2 = _mm_mul_ps(x, x);

    /* Horner steps, highest-order coefficient first: poly = x2 * poly + c */
    __m128 poly = _mm_add_ps(_mm_mul_ps(x2, c13), c11);
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), c9);
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), c7);
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), c5);
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), c3);
    poly = _mm_add_ps(_mm_mul_ps(x2, poly), c1);

    /* The polynomial is odd, so the remaining factor of x is applied last */
    return _mm_mul_ps(x, poly);
}
56+
1957
static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
2058
{
2159
__m128 iValue, qValue;

0 commit comments

Comments
 (0)