Commit 2389090

x86 optimization for convolution int8 gemm (#5874)
* cmake check compiler test cannot be optimized out
* drop requant pack4
1 parent 4a70be4 commit 2389090

18 files changed: +4392 -7091 lines
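The first commit-message bullet refers to the compiler feature probes in the CMake build: if a probe program's intrinsic call is dead code, the optimizer can remove it, and the check no longer proves that the intrinsic actually compiles for the target. A minimal sketch of the pattern, assuming a check_cxx_source_compiles()-style probe; the AVX-VNNI intrinsic here is only an illustration, not necessarily the exact probe this commit changes:

// Hypothetical probe source for a CMake check_cxx_source_compiles() test.
// Returning a value derived from the intrinsic keeps the call live, so the
// compiler cannot optimize the whole test away.
#include <immintrin.h>

int main()
{
    __m256i a = _mm256_set1_epi8(1);
    __m256i b = _mm256_set1_epi8(2);
    __m256i c = _mm256_set1_epi32(0);
    c = _mm256_dpbusd_avx_epi32(c, a, b);                 // AVX-VNNI int8 dot product
    return _mm_cvtsi128_si32(_mm256_castsi256_si128(c));  // observable result
}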

.github/workflows/test-coverage.yml  (+67)

@@ -121,6 +121,73 @@ jobs:
         plugins: noop
         files: build/lcov.info

+  linux-gcc-x64-sde-combined:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+    - name: update
+      run: sudo apt-get update
+    - name: gcc14
+      run: sudo apt-get install gcc-14 g++-14
+    - name: lcov
+      run: sudo apt-get install lcov
+    - name: Setup SDE binaries
+      uses: petarpetrovt/[email protected]
+    - name: build
+      env:
+        CC: gcc-14
+        CXX: g++-14
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . -j $(nproc)
+    - name: test-p4p
+      run: |
+        cd build
+        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-p4p;--" ctest --output-on-failure -j $(nproc)
+    - name: test-snb
+      run: |
+        cd build
+        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-snb;--" ctest --output-on-failure -j $(nproc)
+    - name: test-hsw
+      run: |
+        cd build
+        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-hsw;--" ctest --output-on-failure -j $(nproc)
+    - name: test-adl
+      run: |
+        cd build
+        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-adl;--" ctest --output-on-failure -j $(nproc)
+    - name: test-arl
+      run: |
+        cd build
+        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-arl;--" ctest --output-on-failure -j $(nproc)
+    - name: test-skx
+      run: |
+        cd build
+        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-skx;--" ctest --output-on-failure -j $(nproc)
+    - name: test-spr
+      run: |
+        cd build
+        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j $(nproc)
+    - name: test-gnr
+      run: |
+        cd build
+        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-gnr;--" ctest --output-on-failure -j $(nproc)
+    - name: lcov-collect
+      run: |
+        cd build
+        lcov --gcov-tool gcov-14 -d ./src -c -o lcov.info
+        lcov -r lcov.info '/usr/*' -o lcov.info
+        lcov -r lcov.info '*/build/*' -o lcov.info
+        lcov --list lcov.info
+    - name: codecov
+      uses: codecov/codecov-action@v5
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        disable_search: true
+        plugins: noop
+        files: build/lcov.info
+
   linux-gcc-riscv64-rvv:
     strategy:
       matrix:

src/layer/arm/requantize_arm.cpp  (-45)

@@ -57,11 +57,6 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
             _scale_in0 = vld1q_f32((const float*)scale_in_data);
             _scale_in1 = vld1q_f32((const float*)scale_in_data + 4);
         }
-        if (elempack == 4)
-        {
-            _scale_in0 = vld1q_f32((const float*)scale_in_data);
-            _scale_in1 = _scale_in0;
-        }
     }
 #endif // __ARM_NEON

@@ -76,11 +71,6 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
             _scale_out0 = vld1q_f32((const float*)scale_out_data);
             _scale_out1 = vld1q_f32((const float*)scale_out_data + 4);
         }
-        if (elempack == 4)
-        {
-            _scale_out0 = vld1q_f32((const float*)scale_out_data);
-            _scale_out1 = _scale_out0;
-        }
     }
 #endif // __ARM_NEON

@@ -139,11 +129,6 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
             _bias0 = vld1q_f32((const float*)bias_data);
             _bias1 = vld1q_f32((const float*)bias_data + 4);
         }
-        if (elempack == 4)
-        {
-            _bias0 = vld1q_f32((const float*)bias_data);
-            _bias1 = _bias0;
-        }
     }
 #endif // __ARM_NEON

@@ -224,11 +209,6 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
             _scale_in0 = vld1q_f32((const float*)scale_in_data);
             _scale_in1 = vld1q_f32((const float*)scale_in_data + 4);
         }
-        if (elempack == 4)
-        {
-            _scale_in0 = vld1q_f32((const float*)scale_in_data);
-            _scale_in1 = _scale_in0;
-        }
     }
 #endif // __ARM_NEON

@@ -243,11 +223,6 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
             _scale_out0 = vld1q_f32((const float*)scale_out_data);
             _scale_out1 = vld1q_f32((const float*)scale_out_data + 4);
         }
-        if (elempack == 4)
-        {
-            _scale_out0 = vld1q_f32((const float*)scale_out_data);
-            _scale_out1 = _scale_out0;
-        }
     }
 #endif // __ARM_NEON

@@ -307,11 +282,6 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
             _bias0 = vld1q_f32((const float*)bias_data);
             _bias1 = vld1q_f32((const float*)bias_data + 4);
         }
-        if (elempack == 4)
-        {
-            _bias0 = vld1q_f32((const float*)bias_data);
-            _bias1 = _bias0;
-        }
     }
 #endif // __ARM_NEON

@@ -399,11 +369,6 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
             _scale_in0 = vld1q_f32((const float*)scale_in_data);
             _scale_in1 = vld1q_f32((const float*)scale_in_data + 4);
         }
-        if (elempack == 4)
-        {
-            _scale_in0 = vld1q_f32((const float*)scale_in_data);
-            _scale_in1 = _scale_in0;
-        }
     }
 #endif // __ARM_NEON

@@ -418,11 +383,6 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
             _scale_out0 = vld1q_f32((const float*)scale_out_data);
             _scale_out1 = vld1q_f32((const float*)scale_out_data + 4);
         }
-        if (elempack == 4)
-        {
-            _scale_out0 = vld1q_f32((const float*)scale_out_data);
-            _scale_out1 = _scale_out0;
-        }
     }
 #endif // __ARM_NEON

@@ -481,11 +441,6 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
             _bias0 = vld1q_f32((const float*)bias_data);
             _bias1 = vld1q_f32((const float*)bias_data + 4);
         }
-        if (elempack == 4)
-        {
-            _bias0 = vld1q_f32((const float*)bias_data);
-            _bias1 = _bias0;
-        }
     }
 #endif // __ARM_NEON
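For context on these deletions, the scale and bias vectors prepared above feed the usual int32-to-int8 requantize computation; per element it is roughly the following scalar form (a sketch only, assuming the conventional ncnn ordering of scale_in, bias, activation, scale_out; the real kernels vectorize this and handle more activation types):

#include <math.h>

// Scalar sketch of requantization: dequantize with scale_in, add bias,
// apply the activation (ReLU shown), rescale with scale_out, then
// round and saturate to the signed 8-bit range.
static inline signed char requantize_one(int v32, float scale_in, float bias, float scale_out)
{
    float v = v32 * scale_in + bias;
    if (v < 0.f) v = 0.f; // ReLU; leakyrelu / no activation are analogous
    v *= scale_out;
    int q = (int)roundf(v);
    if (q > 127) q = 127;
    if (q < -127) q = -127;
    return (signed char)q;
}

With the elempack == 4 branches removed, these kernels only load per-channel scales for the pack8 case; pack4 int32 blobs are expected to be repacked before requantization, as the loongarch and mips convolution changes below do.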

src/layer/loongarch/convolution_loongarch.cpp  (+23)

@@ -950,6 +950,29 @@ int Convolution_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& t
         }
     }

+#if __loongarch_sx
+    if (opt.use_packing_layout)
+    {
+        // NCNN_LOGE("top_blob_int32 %d %d", top_blob_int32.c, top_blob_int32.elempack);
+        if (use_int8_requantize)
+        {
+            // TODO implement winograd sgemm packed int8 pack1 output
+            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 1)
+            {
+                Mat tmp;
+                convert_packing(top_blob_int32, tmp, 1, opt);
+                top_blob_int32 = tmp;
+            }
+            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 0)
+            {
+                Mat tmp;
+                convert_packing(top_blob_int32, tmp, 8, opt);
+                top_blob_int32 = tmp;
+            }
+        }
+    }
+#endif
+
     if (use_int8_requantize)
     {
         requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
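The block added above repacks the int32 convolution output before requantization, matching the removal of the pack4 requantize path: a pack4 blob is converted to pack8 when its total channel count is a multiple of 8 (c even), and to pack1 otherwise. A hedged sketch of that decision in isolation; choose_requantize_packing is a hypothetical helper, not an ncnn function:

// Hypothetical helper mirroring the added logic.  A pack4 int32 blob holds
// c groups of 4 channels, i.e. c * 4 channels in total; if that total is
// divisible by 8 (c even) it can be repacked to pack8 for the wide
// requantize kernel, otherwise it falls back to pack1.
static int choose_requantize_packing(int c, int elempack)
{
    if (elempack != 4)
        return elempack; // pack1 and pack8 blobs pass through unchanged

    const int total_channels = c * elempack;
    return (total_channels % 8 == 0) ? 8 : 1;
}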

src/layer/loongarch/requantize_loongarch.cpp  (-45)

@@ -56,11 +56,6 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
             _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
             _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
-            _scale_in1 = _scale_in0;
-        }
     }
 #endif // __loongarch_sx

@@ -75,11 +70,6 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
             _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
             _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
-            _scale_out1 = _scale_out0;
-        }
     }
 #endif // __loongarch_sx

@@ -139,11 +129,6 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
             _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
             _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
-            _bias1 = _bias0;
-        }
     }
 #endif // __loongarch_sx

@@ -216,11 +201,6 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
             _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
             _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
-            _scale_in1 = _scale_in0;
-        }
     }
 #endif // __loongarch_sx

@@ -235,11 +215,6 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
             _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
             _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
-            _scale_out1 = _scale_out0;
-        }
     }
 #endif // __loongarch_sx

@@ -300,11 +275,6 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
             _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
             _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
-            _bias1 = _bias0;
-        }
     }
 #endif // __loongarch_sx

@@ -384,11 +354,6 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
             _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
             _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
-            _scale_in1 = _scale_in0;
-        }
     }
 #endif // __loongarch_sx

@@ -403,11 +368,6 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
             _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
             _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
-            _scale_out1 = _scale_out0;
-        }
     }
 #endif // __loongarch_sx

@@ -467,11 +427,6 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
             _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
             _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0);
         }
-        if (elempack == 4)
-        {
-            _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
-            _bias1 = _bias0;
-        }
     }
 #endif // __loongarch_sx

src/layer/mips/convolution_mips.cpp  (+23)

@@ -950,6 +950,29 @@ int Convolution_mips::forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, c
         }
     }

+#if __mips_msa
+    if (opt.use_packing_layout)
+    {
+        // NCNN_LOGE("top_blob_int32 %d %d", top_blob_int32.c, top_blob_int32.elempack);
+        if (use_int8_requantize)
+        {
+            // TODO implement winograd sgemm packed int8 pack1 output
+            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 1)
+            {
+                Mat tmp;
+                convert_packing(top_blob_int32, tmp, 1, opt);
+                top_blob_int32 = tmp;
+            }
+            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 0)
+            {
+                Mat tmp;
+                convert_packing(top_blob_int32, tmp, 8, opt);
+                top_blob_int32 = tmp;
+            }
+        }
+    }
+#endif
+
     if (use_int8_requantize)
     {
         requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
