Skip to content

Commit 5aaac67

Browse files
authored
Merge pull request #774 from camel-cdr/main
Implement RISC-V Vector 1.0 kernels
2 parents 7fdde55 + 8e88f1e commit 5aaac67

File tree

147 files changed

+5396
-116
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

147 files changed

+5396
-116
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#
2+
# Copyright 2020 - 2022 Free Software Foundation, Inc.
3+
#
4+
# This file is part of VOLK
5+
#
6+
# SPDX-License-Identifier: LGPL-3.0-or-later
7+
#
8+
9+
name: Run VOLK tests on different RVV configurations
10+
11+
on: [push, pull_request]
12+
13+
jobs:
14+
Tests:
15+
runs-on: ubuntu-24.04
16+
steps:
17+
- uses: actions/checkout@v4
18+
with:
19+
submodules: "recursive"
20+
- name: Install packages
21+
run: |
22+
sudo apt-get update -q -y
23+
sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18
24+
mkdir build
25+
cd build
26+
- name: Test gcc-14 VLEN=128
27+
run: |
28+
cd build; rm -rf *
29+
CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \
30+
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
31+
make -j$(nproc)
32+
ARGS=-V make test
33+
- name: Test gcc-14 VLEN=256
34+
run: |
35+
cd build; rm -rf *
36+
CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \
37+
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
38+
make -j$(nproc)
39+
ARGS=-V make test
40+
- name: Test clang-18 VLEN=512
41+
run: |
42+
cd build; rm -rf *
43+
CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \
44+
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
45+
make -j$(nproc)
46+
ARGS=-V make test
47+
- name: Test clang-18 VLEN=1024
48+
run: |
49+
cd build; rm -rf *
50+
CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \
51+
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
52+
make -j$(nproc)
53+
ARGS=-V make test
54+
55+
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14)
2+
int main() { return 0; }
3+
#else
4+
#error "rvv intrinsics aren't supported"
5+
#endif
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#
2+
# Copyright 2024 Free Software Foundation, Inc.
3+
#
4+
# This file is part of VOLK
5+
#
6+
# SPDX-License-Identifier: LGPL-3.0-or-later
7+
#
8+
9+
set(CMAKE_SYSTEM_NAME Linux)
10+
set(CMAKE_SYSTEM_PROCESSOR riscv64)
11+
12+
set(CMAKE_C_COMPILER $ENV{CC})
13+
set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
14+
set(CMAKE_CXX_COMPILER $ENV{CXX})
15+
16+
set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE)
17+
set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "" FORCE)
18+
set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE)
19+
20+
set(CMAKE_OBJCOPY
21+
${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy
22+
CACHE INTERNAL "objcopy tool")
23+
set(CMAKE_SIZE_UTIL
24+
${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size
25+
CACHE INTERNAL "size tool")
26+
27+
set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})
28+
29+
set(QEMU_VLEN $ENV{VLEN})
30+
if(NOT QEMU_VLEN)
31+
set(QEMU_VLEN "128")
32+
endif()
33+
34+
set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on")

gen/archs.xml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,4 +181,48 @@ at the top, as a last resort.
181181
<arch name="riscv64">
182182
</arch>
183183

184+
<!-->
185+
tmpl/ currently assumes that every arch.name starting with "rv" requires
186+
RVV intrinsics
187+
</-->
188+
<!-->
189+
There is currently no mechanism in RISC-V to append extensions,
190+
so each arch needs to specify all of them, and the order needs in the
191+
machine definition needs to be from the fewest to the most extensions.
192+
Fortunately, this maps quite well to the profiles concept.
193+
</-->
194+
<arch name="rvv">
195+
<check name="V"></check>
196+
<flag compiler="gnu">-march=rv64gcv</flag>
197+
<flag compiler="clang">-march=rv64gcv</flag>
198+
</arch>
199+
200+
<arch name="rvvseg">
201+
<check name="V"></check>
202+
<flag compiler="gnu">-march=rv64gcv</flag>
203+
<flag compiler="clang">-march=rv64gcv</flag>
204+
<!-->
205+
It's unclear how performance portable segmented load/stores are, so the
206+
default rvv implementations avoid using them.
207+
This is a pseudo arch for separate segmented load/store implementations,
208+
and is expected to never be used standalone without "rvv".
209+
</-->
210+
</arch>
211+
212+
<!-->
213+
google/cpu_features currently doesn't support these extensions and profiles.
214+
</-->
215+
<!--arch name="rva22v">
216+
<check name="V"></check>
217+
<check name="B"></check>
218+
<flag compiler="gnu">-march=rv64gcv_zba_zbb_zbs</flag>
219+
<flag compiler="clang">-march=rv64gcv_zba_zbb_zbs</flag>
220+
</arch-->
221+
222+
<!--arch name="rva23">
223+
<check name="rva23"></check>
224+
<flag compiler="gnu">-march=rva23u64</flag>
225+
<flag compiler="clang">-march=rva23u64</flag>
226+
</arch-->
227+
184228
</grammar>

gen/machines.xml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,18 @@
3333
<archs>generic riscv64 orc|</archs>
3434
</machine>
3535

36+
<machine name="rv64gcv">
37+
<archs>generic riscv64 rvv rvvseg orc|</archs>
38+
</machine>
39+
40+
<!--machine name="rva22v">
41+
<archs>generic riscv64 rvv rvvseg rva22v orc|</archs>
42+
</machine-->
43+
44+
<!--machine name="rva23">
45+
<archs>generic riscv64 rvv rvvseg rva22v rva23 orc|</archs>
46+
</machine-->
47+
3648
<machine name="sse4_a">
3749
<archs>generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|</archs>
3850
</machine>

include/volk/volk_rvv_intrinsics.h

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/* -*- c++ -*- */
2+
/*
3+
* Copyright 2024 Free Software Foundation, Inc.
4+
*
5+
* This file is part of VOLK
6+
*
7+
* SPDX-License-Identifier: LGPL-3.0-or-later
8+
*/
9+
10+
/*
11+
* This file is intended to hold RVV intrinsics of intrinsics.
12+
* They should be used in VOLK kernels to avoid copy-paste.
13+
*/
14+
15+
#ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
16+
#define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
17+
#include <riscv_vector.h>
18+
19+
#define RISCV_SHRINK2(op, T, S, v) \
20+
__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
21+
__riscv_vget_##T##S##m1(v, 1), \
22+
__riscv_vsetvlmax_e##S##m1())
23+
24+
#define RISCV_SHRINK4(op, T, S, v) \
25+
__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
26+
__riscv_vget_##T##S##m1(v, 1), \
27+
__riscv_vsetvlmax_e##S##m1()), \
28+
__riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
29+
__riscv_vget_##T##S##m1(v, 3), \
30+
__riscv_vsetvlmax_e##S##m1()), \
31+
__riscv_vsetvlmax_e##S##m1())
32+
33+
#define RISCV_SHRINK8(op, T, S, v) \
34+
__riscv_##op(__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
35+
__riscv_vget_##T##S##m1(v, 1), \
36+
__riscv_vsetvlmax_e##S##m1()), \
37+
__riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
38+
__riscv_vget_##T##S##m1(v, 3), \
39+
__riscv_vsetvlmax_e##S##m1()), \
40+
__riscv_vsetvlmax_e##S##m1()), \
41+
__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \
42+
__riscv_vget_##T##S##m1(v, 5), \
43+
__riscv_vsetvlmax_e##S##m1()), \
44+
__riscv_##op(__riscv_vget_##T##S##m1(v, 6), \
45+
__riscv_vget_##T##S##m1(v, 7), \
46+
__riscv_vsetvlmax_e##S##m1()), \
47+
__riscv_vsetvlmax_e##S##m1()), \
48+
__riscv_vsetvlmax_e##S##m1())
49+
50+
#define RISCV_PERM4(f, v, vidx) \
51+
__riscv_vcreate_v_u8m1_u8m4( \
52+
f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
53+
f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
54+
f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
55+
f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()))
56+
57+
#define RISCV_LUT4(f, vtbl, v) \
58+
__riscv_vcreate_v_u8m1_u8m4( \
59+
f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \
60+
f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \
61+
f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \
62+
f(vtbl, __riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1()))
63+
64+
#define RISCV_PERM8(f, v, vidx) \
65+
__riscv_vcreate_v_u8m1_u8m8( \
66+
f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
67+
f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
68+
f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
69+
f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \
70+
f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \
71+
f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \
72+
f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \
73+
f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1()))
74+
75+
#define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)
76+
77+
#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */

kernels/volk/volk_16i_32fc_dot_prod_32fc.h

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,5 +668,66 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
668668

669669
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
670670

671+
#ifdef LV_HAVE_RVV
672+
#include <riscv_vector.h>
673+
#include <volk/volk_rvv_intrinsics.h>
674+
675+
static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
676+
const short* input,
677+
const lv_32fc_t* taps,
678+
unsigned int num_points)
679+
{
680+
vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
681+
vfloat32m4_t vsumi = vsumr;
682+
size_t n = num_points;
683+
for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
684+
vl = __riscv_vsetvl_e32m4(n);
685+
vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
686+
vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
687+
vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
688+
vfloat32m4_t v =
689+
__riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
690+
vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
691+
vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
692+
}
693+
size_t vl = __riscv_vsetvlmax_e32m1();
694+
vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
695+
vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
696+
vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
697+
*result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
698+
__riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
699+
}
700+
#endif /*LV_HAVE_RVV*/
701+
702+
#ifdef LV_HAVE_RVVSEG
703+
#include <riscv_vector.h>
704+
#include <volk/volk_rvv_intrinsics.h>
705+
706+
static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
707+
const short* input,
708+
const lv_32fc_t* taps,
709+
unsigned int num_points)
710+
{
711+
vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
712+
vfloat32m4_t vsumi = vsumr;
713+
size_t n = num_points;
714+
for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
715+
vl = __riscv_vsetvl_e32m4(n);
716+
vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
717+
vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
718+
vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
719+
vfloat32m4_t v =
720+
__riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
721+
vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
722+
vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
723+
}
724+
size_t vl = __riscv_vsetvlmax_e32m1();
725+
vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
726+
vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
727+
vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
728+
*result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
729+
__riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
730+
}
731+
#endif /*LV_HAVE_RVVSEG*/
671732

672733
#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/

kernels/volk/volk_16i_branch_4_state_8.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
/*!
1111
* \page volk_16i_branch_4_state_8
1212
*
13+
* \b Deprecation
14+
*
15+
* This kernel is deprecated.
16+
*
1317
* \b Overview
1418
*
1519
* <FIXME>

kernels/volk/volk_16i_convert_8i.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,5 +275,20 @@ static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
275275
}
276276
#endif /* LV_HAVE_NEON */
277277

278+
#ifdef LV_HAVE_RVV
279+
#include <riscv_vector.h>
280+
281+
static inline void volk_16i_convert_8i_rvv(int8_t* outputVector,
282+
const int16_t* inputVector,
283+
unsigned int num_points)
284+
{
285+
size_t n = num_points;
286+
for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
287+
vl = __riscv_vsetvl_e16m8(n);
288+
vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl);
289+
__riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl);
290+
}
291+
}
292+
#endif /*LV_HAVE_RVV*/
278293

279294
#endif /* INCLUDED_volk_16i_convert_8i_a_H */

kernels/volk/volk_16i_max_star_16i.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
/*!
1111
* \page volk_16i_max_star_16i
1212
*
13+
* \b Deprecation
14+
*
15+
* This kernel is deprecated.
16+
*
1317
* \b Overview
1418
*
1519
* <FIXME>

0 commit comments

Comments
 (0)