Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arctan avx512 #759

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ install(
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
Expand Down
8 changes: 8 additions & 0 deletions gen/archs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ at the top, as a last resort.
<alignment>64</alignment>
</arch>

<arch name="avx512dq">
<check name="avx512dq"></check>
<flag compiler="gnu">-mavx512dq</flag>
<flag compiler="clang">-mavx512dq</flag>
<flag compiler="msvc">/arch:AVX512DQ</flag>
<alignment>64</alignment>
</arch>

<arch name="riscv64">
</arch>

Expand Down
5 changes: 5 additions & 0 deletions gen/machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,9 @@
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
</machine>

<!-- trailing | bar means generate without either for MSVC -->
<machine name="avx512dq">
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
</machine>

</grammar>
4 changes: 2 additions & 2 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
*/

/*
* This file is intended to hold AVX2 FMA intrinsics of intrinsics.
* This file is intended to hold AVX2 FMA intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

Expand All @@ -23,7 +23,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
Expand Down
71 changes: 71 additions & 0 deletions include/volk/volk_avx512_intrinsics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/* -*- c++ -*- */
/*
* Copyright 2024 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold AVX512 intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#include <immintrin.h>

////////////////////////////////////////////////////////////////////////
// Gather the real parts of two complex vectors into a single vector
// z1 and z2 are treated as interleaved (re, im) float pairs: the
// even-numbered elements of z1 land in output lanes 0-7 and the
// even-numbered elements of z2 land in output lanes 8-15
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
{
    // Selector values 0-15 pick from z1, 16-31 pick from z2.
    // Written in lane order (lane 0 first).
    const __m512i even_lanes =
        _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
    return _mm512_permutex2var_ps(z1, even_lanes, z2);
}

////////////////////////////////////////////////////////////////////////
// Gather the imaginary parts of two complex vectors into a single vector
// z1 and z2 are treated as interleaved (re, im) float pairs: the
// odd-numbered elements of z1 land in output lanes 0-7 and the
// odd-numbered elements of z2 land in output lanes 8-15
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
{
    // Selector values 0-15 pick from z1, 16-31 pick from z2.
    // Written in lane order (lane 0 first).
    const __m512i odd_lanes =
        _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
    return _mm512_permutex2var_ps(z1, odd_lanes, z2);
}

////////////////////////////////////////////////////////////////////////
// Approximate arctan(x) via polynomial expansion on the interval [-1, 1]
// Maximum relative error ~6.5e-7
// Computes x * (a1 + a3*x^2 + a5*x^4 + ... + a13*x^12); the inner
// polynomial in x^2 is evaluated via Horner's method with fused
// multiply-adds, starting from the highest-order coefficient
// Each of the 16 float lanes of x is processed independently
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
{
    // Coefficients of the odd powers x^1 .. x^13, broadcast to all lanes
    const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
    const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f);
    const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f);
    const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f);
    const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f);
    const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f);
    const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f);

    const __m512 x_times_x = _mm512_mul_ps(x, x);

    // Horner steps: arctan = x^2 * arctan + a_k, from a13 down to a1
    __m512 arctan = a13;
    arctan = _mm512_fmadd_ps(x_times_x, arctan, a11);
    arctan = _mm512_fmadd_ps(x_times_x, arctan, a9);
    arctan = _mm512_fmadd_ps(x_times_x, arctan, a7);
    arctan = _mm512_fmadd_ps(x_times_x, arctan, a5);
    arctan = _mm512_fmadd_ps(x_times_x, arctan, a3);
    arctan = _mm512_fmadd_ps(x_times_x, arctan, a1);

    // Final multiply by x restores the odd powers
    arctan = _mm512_mul_ps(x, arctan);

    return arctan;
}

#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
4 changes: 2 additions & 2 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
*/

/*
* This file is intended to hold AVX intrinsics of intrinsics.
* This file is intended to hold AVX intrinsics.
* They should be used in VOLK kernels to avoid copy-pasta.
*/

Expand All @@ -24,7 +24,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
static inline __m256 _mm256_arctan_poly_avx(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
Expand Down
Loading
Loading