Skip to content

Commit 2f56ec7

Browse files
committed
add RISC-V Vector extension (RVV) kernels
Signed-off-by: Olaf Bernstein <[email protected]>
1 parent bd4694d commit 2f56ec7

File tree

147 files changed

+5321
-89
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

147 files changed

+5321
-89
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#
2+
# Copyright 2020 - 2022 Free Software Foundation, Inc.
3+
#
4+
# This file is part of VOLK
5+
#
6+
# SPDX-License-Identifier: LGPL-3.0-or-later
7+
#
8+
9+
name: Run VOLK tests on different RVV configurations
10+
11+
on: [push, pull_request]
12+
13+
jobs:
14+
Tests:
15+
runs-on: ubuntu-24.04
16+
steps:
17+
- uses: actions/checkout@v4
18+
- name: Install packages
19+
run: |
20+
git submodule update --init --recursive
21+
sudo apt-get update -q -y
22+
sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18
23+
mkdir build
24+
cd build
25+
- name: Test gcc-14 VLEN=128
26+
run: |
27+
cd build; rm -rf *
28+
CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \
29+
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
30+
make -j$(nproc)
31+
ARGS=-j$(nproc) make test
32+
- name: Test gcc-14 VLEN=256
33+
run: |
34+
cd build; rm -rf *
35+
CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \
36+
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
37+
make -j$(nproc)
38+
ARGS=-j$(nproc) make test
39+
- name: Test clang-18 VLEN=512
40+
run: |
41+
cd build; rm -rf *
42+
CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \
43+
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
44+
make -j$(nproc)
45+
ARGS=-j$(nproc) make test
46+
- name: Test clang-18 VLEN=1024
47+
run: |
48+
cd build; rm -rf *
49+
CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \
50+
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
51+
make -j$(nproc)
52+
ARGS=-j$(nproc) make test
53+
54+
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14)
2+
int main() { return 0; }
3+
#else
4+
#error "rvv intrinsics aren't supported"
5+
#endif
6+
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#
2+
# Copyright 2024 Free Software Foundation, Inc.
3+
#
4+
# This file is part of VOLK
5+
#
6+
# SPDX-License-Identifier: LGPL-3.0-or-later
7+
#
8+
9+
set(CMAKE_SYSTEM_NAME Linux)
10+
set(CMAKE_SYSTEM_PROCESSOR riscv64)
11+
12+
set(CMAKE_C_COMPILER $ENV{CC})
13+
set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
14+
set(CMAKE_CXX_COMPILER $ENV{CXX})
15+
16+
set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE)
17+
set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "" FORCE)
18+
set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE)
19+
20+
set(CMAKE_OBJCOPY
21+
${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy
22+
CACHE INTERNAL "objcopy tool")
23+
set(CMAKE_SIZE_UTIL
24+
${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size
25+
CACHE INTERNAL "size tool")
26+
27+
set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})
28+
29+
set(QEMU_VLEN $ENV{VLEN})
30+
if(NOT QEMU_VLEN)
31+
set(QEMU_VLEN "128")
32+
endif()
33+
34+
set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on")

gen/archs.xml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,4 +181,48 @@ at the top, as a last resort.
181181
<arch name="riscv64">
182182
</arch>
183183

184+
<!-->
185+
tmpl/ currently assumes that every arch.name starting with "rv" requires
186+
RVV intrinsics
187+
</-->
188+
<!-->
189+
There is currently no mechanism in RISC-V to append extensions,
190+
so each arch needs to specify all of them, and the order needs in the
191+
machine definition needs to be from the fewest to the most extensions.
192+
Fortunately, this maps quite well to the profiles concept.
193+
</-->
194+
<arch name="rvv">
195+
<check name="V"></check>
196+
<flag compiler="gnu">-march=rv64gcv</flag>
197+
<flag compiler="clang">-march=rv64gcv</flag>
198+
</arch>
199+
200+
<arch name="rvvseg">
201+
<check name="V"></check>
202+
<flag compiler="gnu">-march=rv64gcv</flag>
203+
<flag compiler="clang">-march=rv64gcv</flag>
204+
<!-->
205+
It's unclear how performance portable segmented load/stores are, so the
206+
default rvv implementations avoid using them.
207+
This is a pseudo arch for separate segmented load/store implementations,
208+
and is expected to never be used standalone without "rvv".
209+
</-->
210+
</arch>
211+
212+
<!-->
213+
google/cpu_features currently doesn't support these extensions and profiles.
214+
</-->
215+
<!--arch name="rva22v">
216+
<check name="V"></check>
217+
<check name="B"></check>
218+
<flag compiler="gnu">-march=rv64gcv_zba_zbb_zbs</flag>
219+
<flag compiler="clang">-march=rv64gcv_zba_zbb_zbs</flag>
220+
</arch-->
221+
222+
<!--arch name="rva23">
223+
<check name="rva23"></check>
224+
<flag compiler="gnu">-march=rva23u64</flag>
225+
<flag compiler="clang">-march=rva23u64</flag>
226+
</arch-->
227+
184228
</grammar>

gen/machines.xml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,18 @@
3333
<archs>generic riscv64 orc|</archs>
3434
</machine>
3535

36+
<machine name="rv64gcv">
37+
<archs>generic riscv64 rvv rvvseg orc|</archs>
38+
</machine>
39+
40+
<!--machine name="rva22v">
41+
<archs>generic riscv64 rvv rvvseg rva22v orc|</archs>
42+
</machine-->
43+
44+
<!--machine name="rva23">
45+
<archs>generic riscv64 rvv rvvseg rva22v rva23 orc|</archs>
46+
</machine-->
47+
3648
<machine name="sse4_a">
3749
<archs>generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|</archs>
3850
</machine>

include/volk/volk_rvv_intrinsics.h

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/* -*- c++ -*- */
2+
/*
3+
* Copyright 2024 Free Software Foundation, Inc.
4+
*
5+
* This file is part of VOLK
6+
*
7+
* SPDX-License-Identifier: LGPL-3.0-or-later
8+
*/
9+
10+
/*
11+
* This file is intended to hold RVV intrinsics of intrinsics.
12+
* They should be used in VOLK kernels to avoid copy-paste.
13+
*/
14+
15+
#ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
16+
#define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
17+
#include <riscv_vector.h>
18+
19+
#define RISCV_SHRINK2(op,T,S,v) __riscv_##op( \
20+
__riscv_vget_##T##S##m1(v, 0), \
21+
__riscv_vget_##T##S##m1(v, 1), \
22+
__riscv_vsetvlmax_e##S##m1())
23+
24+
#define RISCV_SHRINK4(op,T,S,v) __riscv_##op( \
25+
__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
26+
__riscv_vget_##T##S##m1(v, 1), \
27+
__riscv_vsetvlmax_e##S##m1()), \
28+
__riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
29+
__riscv_vget_##T##S##m1(v, 3), \
30+
__riscv_vsetvlmax_e##S##m1()), \
31+
__riscv_vsetvlmax_e##S##m1())
32+
33+
#define RISCV_SHRINK8(op,T,S,v) __riscv_##op( \
34+
__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
35+
__riscv_vget_##T##S##m1(v, 1), \
36+
__riscv_vsetvlmax_e##S##m1()), \
37+
__riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
38+
__riscv_vget_##T##S##m1(v, 3), \
39+
__riscv_vsetvlmax_e##S##m1()), \
40+
__riscv_vsetvlmax_e##S##m1()), \
41+
__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \
42+
__riscv_vget_##T##S##m1(v, 5), \
43+
__riscv_vsetvlmax_e##S##m1()), \
44+
__riscv_##op(__riscv_vget_##T##S##m1(v, 6), \
45+
__riscv_vget_##T##S##m1(v, 7), \
46+
__riscv_vsetvlmax_e##S##m1()), \
47+
__riscv_vsetvlmax_e##S##m1()), \
48+
__riscv_vsetvlmax_e##S##m1())
49+
50+
#define RISCV_PERM4(f,v,vidx) __riscv_vcreate_v_u8m1_u8m4( \
51+
f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
52+
f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
53+
f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
54+
f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()))
55+
56+
#define RISCV_LUT4(f,vtbl,v) __riscv_vcreate_v_u8m1_u8m4( \
57+
f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \
58+
f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \
59+
f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \
60+
f(vtbl, __riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1()))
61+
62+
#define RISCV_PERM8(f,v,vidx) __riscv_vcreate_v_u8m1_u8m8( \
63+
f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
64+
f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
65+
f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
66+
f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \
67+
f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \
68+
f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \
69+
f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \
70+
f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1()))
71+
72+
#define RISCV_VMFLTZ(T,v,vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)
73+
74+
#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */

kernels/volk/volk_16i_32fc_dot_prod_32fc.h

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,5 +668,68 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
668668

669669
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
670670

671+
#ifdef LV_HAVE_RVV
672+
#include <riscv_vector.h>
673+
#include <volk/volk_rvv_intrinsics.h>
674+
675+
static inline void
676+
volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
677+
const short* input,
678+
const lv_32fc_t* taps,
679+
unsigned int num_points)
680+
{
681+
vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
682+
vfloat32m4_t vsumi = vsumr;
683+
size_t n = num_points;
684+
for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
685+
vl = __riscv_vsetvl_e32m4(n);
686+
vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
687+
vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
688+
vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
689+
vfloat32m4_t v = __riscv_vfwcvt_f(
690+
__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
691+
vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
692+
vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
693+
}
694+
size_t vl =__riscv_vsetvlmax_e32m1();
695+
vfloat32m1_t vr = RISCV_SHRINK4(vfadd,f,32,vsumr);
696+
vfloat32m1_t vi = RISCV_SHRINK4(vfadd,f,32,vsumi);
697+
vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
698+
*result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
699+
__riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
700+
}
701+
#endif /*LV_HAVE_RVV*/
702+
703+
#ifdef LV_HAVE_RVVSEG
704+
#include <riscv_vector.h>
705+
#include <volk/volk_rvv_intrinsics.h>
706+
707+
static inline void
708+
volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
709+
const short* input,
710+
const lv_32fc_t* taps,
711+
unsigned int num_points)
712+
{
713+
vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
714+
vfloat32m4_t vsumi = vsumr;
715+
size_t n = num_points;
716+
for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
717+
vl = __riscv_vsetvl_e32m4(n);
718+
vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
719+
vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
720+
vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
721+
vfloat32m4_t v = __riscv_vfwcvt_f(
722+
__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
723+
vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
724+
vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
725+
}
726+
size_t vl =__riscv_vsetvlmax_e32m1();
727+
vfloat32m1_t vr = RISCV_SHRINK4(vfadd,f,32,vsumr);
728+
vfloat32m1_t vi = RISCV_SHRINK4(vfadd,f,32,vsumi);
729+
vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
730+
*result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
731+
__riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
732+
}
733+
#endif /*LV_HAVE_RVVSEG*/
671734

672735
#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/

kernels/volk/volk_16i_branch_4_state_8.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
/*!
1111
* \page volk_16i_branch_4_state_8
1212
*
13+
* \b Deprecation
14+
*
15+
* This kernel is deprecated.
16+
*
1317
* \b Overview
1418
*
1519
* <FIXME>

kernels/volk/volk_16i_convert_8i.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,5 +275,21 @@ static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
275275
}
276276
#endif /* LV_HAVE_NEON */
277277

278+
#ifdef LV_HAVE_RVV
279+
#include <riscv_vector.h>
280+
281+
static inline void
282+
volk_16i_convert_8i_rvv(int8_t* outputVector,
283+
const int16_t* inputVector,
284+
unsigned int num_points)
285+
{
286+
size_t n = num_points;
287+
for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
288+
vl = __riscv_vsetvl_e16m8(n);
289+
vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl);
290+
__riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl);
291+
}
292+
}
293+
#endif /*LV_HAVE_RVV*/
278294

279295
#endif /* INCLUDED_volk_16i_convert_8i_a_H */

kernels/volk/volk_16i_max_star_16i.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
/*!
1111
* \page volk_16i_max_star_16i
1212
*
13+
* \b Deprecation
14+
*
15+
* This kernel is deprecated.
16+
*
1317
* \b Overview
1418
*
1519
* <FIXME>

0 commit comments

Comments
 (0)