Commit 331a833

bolt v0.3.0 is released
1 parent a193d96 commit 331a833

598 files changed (+40759, -13573 lines)


CMakeLists.txt

Lines changed: 21 additions & 6 deletions
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.15)
+cmake_minimum_required(VERSION 3.2)
 
 file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake)
 if (BOLT_CONFIGURE_FILE)
@@ -20,9 +20,11 @@ add_subdirectory(model-tools)
 add_subdirectory(tensor_computing)
 add_subdirectory(image)
 add_subdirectory(inference)
+add_subdirectory(tools)
+add_subdirectory(kits)
 add_subdirectory(tests)
 add_custom_target(bolt_library ALL
-    COMMAND ./scripts/build_light_bolt.sh ${CMAKE_BINARY_DIR} ${USE_MALI}
+    COMMAND ./CI_SCRIPTS/build_light_bolt.sh ${CMAKE_BINARY_DIR} ${USE_MALI} ${USE_DEBUG} ${USE_LLVM_CLANG} ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP}
     WORKING_DIRECTORY $ENV{BOLT_ROOT})
 
 if (USE_MALI)
@@ -38,6 +40,7 @@ add_dependencies(tensor_computing blas-enhance)
 add_dependencies(tensor_computing_static blas-enhance_static)
 add_dependencies(inference tensor_computing model-tools image)
 add_dependencies(inference_static tensor_computing_static model-tools_static image_static)
+add_dependencies(bolt_library inference)
 add_dependencies(bolt_library inference_static)
 
 install(TARGETS blas-enhance blas-enhance_static
@@ -82,22 +85,34 @@ install(DIRECTORY model-tools/tools/tensorflow2caffe
                   model-tools/tools/pytorch2caffe
         DESTINATION tools)
 
-install(TARGETS tensor_computing_library_search
-        RUNTIME DESTINATION tools)
+if (USE_LIBRARY_TUNING)
+    install(TARGETS tensor_computing_library_search
+            RUNTIME DESTINATION tools)
+endif (USE_LIBRARY_TUNING)
 
 if (BUILD_TEST)
+    if (USE_INT8)
+        install(TARGETS ptq_calibration
+                RUNTIME DESTINATION tools)
+    endif(USE_INT8)
     install(TARGETS classification
                     bert
                     tinybert
                     nmt
-            RUNTIME DESTINATION bin)
+                    asr_rnnt
+                    asr_convolution_transformer
+                    tts
+                    vad
+            RUNTIME DESTINATION kits)
 endif(BUILD_TEST)
 
 install(DIRECTORY inference/exports/java
                   inference/exports/c
         DESTINATION include)
 
 install(FILES ${CMAKE_BINARY_DIR}/libBoltModel.so
+              ${CMAKE_BINARY_DIR}/libbolt.a
+              ${CMAKE_BINARY_DIR}/libbolt.so
         DESTINATION lib)
 
 execute_process(COMMAND doxygen .Doxyfile WORKING_DIRECTORY $ENV{BOLT_ROOT})
@@ -107,7 +122,7 @@ enable_testing()
 find_program (BASH_PROGRAM bash)
 
 if (BASH_PROGRAM)
-    set(parameters -b $ENV{BOLT_ROOT}/tests/bin -p /data/local/tmp/uldra)
+    set(parameters -t $ENV{BOLT_ROOT}/tests/bin -k $ENV{BOLT_ROOT}/kits/bin -p /data/local/tmp/uldra)
     if (USE_MALI)
         set(parameters ${parameters} -g)
     endif(USE_MALI)
File renamed without changes.

README.md

Lines changed: 31 additions & 17 deletions
@@ -8,30 +8,39 @@ Bolt is a light-weight library for mobile devices. Bolt, as a universal deployme
 
 - ### Overview
 
-Bolt is highly optimized for ARMv8.2 CPUs, supporting fast inference of FP16, INT8 and BNN networks. Recently, FP32 functionality has been integrated, which also works on ARMv8 devices.
+Bolt supports almost all ARM-A devices, including ARMv7/ARMv8/ARMv8.2/Mali-GPU. FP16/BNN for CPU and FP16 for GPU are highly optimized. Bolt also supports FP32 on ARMv7/ARMv8/ARMv8.2 devices.
 
-Bolt has its own format of model storage, which helps reduce the memory footprint by storing in FP16 and 1-bit representations when possible. We provide model converters for the following formats:
+Bolt has its own format of model storage, which helps reduce the memory footprint by storing in FP16, INT8 and 1-bit representations when possible. We provide model converters for the following formats:
 
 - caffe
 - onnx
 - tflite
 
-For PyTorch and TensorFlow models, please try to convert them to the onnx format first. We also had some success in converting these models into customized caffe models.
+For PyTorch and TensorFlow models, please try to convert them to the onnx or tflite format first. We also had some success in converting these models into customized caffe models.
 
 - ### Verified Networks
 
 Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md).
 
-- Squeezenet (full-network int8 quantization)
-- Mobilenet v1 - v3
+- Squeezenet
+- Mobilenet v1, v2, v3
 - Resnet50, [Ghostnet](https://github.com/huawei-noah/ghostnet) (plus FPN detection)
 - Birealnet18 (BNN)
+- SSD(Resnet)
 - Bert, TinyBert, Albert
 - Neural Machine Translation
+- Automatic Speech Recognition
+- Text To Speech
+
+For MALI GPU FP16 Support
+- Squeezenet v1.1
+- Mobilenet v1, v2, v3
+- Ghostnet
+
 
 - ### Inference Graph Optimizers
 
-Apart from the refined acceleration of convolutions and GeMM for the supported data precisions, Bolt has a sophisticated inference graph optimizer. As shown in [model-tools/include](model-tools/include), classic operator fusion is supported. Bolt is also equipped with a Memory Reuse Optmizer, which reassigns the space occupied by a feature map as soon as it is no longer needed as input or output. Most networks that we tested benefit from a two-third reduction in feature map storage.
+Apart from the refined acceleration of convolutions and GeMM for the supported data precisions, Bolt has an easy-to-use and powerful inference graph optimizer. As shown in [model-tools/include](model-tools/include), classic operator fusion is supported. Bolt is also equipped with a Memory Reuse Optimizer, which reassigns the space occupied by a feature map as soon as it is no longer needed as input or output. Most networks that we tested benefit from a two-thirds reduction in feature map storage.
 
 - ### Thread Affinity Setting
 
@@ -93,11 +102,12 @@ We provide a detailed benchmark report for your reference. For more testing info
 
 # Road Map
 
-#### v0.3.0
+#### v0.4.0
 
-Future Release 2020-04-01
+Future Release 2020-09-01
 
-- GPU
+- Yolo support
+- TensorFlow model converter
 
 # Who are using Bolt
 
@@ -106,27 +116,31 @@ Future Release 2020-04-01
 
 # FAQ
 
-1. More details about dependency libraries for cross-compilation?
+1. Why does configuring bolt.cmake not take effect?
+
+   The [install.sh](install.sh) serves as an example of compilation setup, and it overwrites some settings in [bolt.cmake](bolt.cmake). Please check install.sh first.
+
+2. More details about dependency libraries for cross-compilation?
 
    The major dependency is Protobuf. Protoc should be the x86 version but protobuf should be the ARM version.
 
-2. Requirements on tensor dimensions?
+3. Requirements on tensor dimensions?
 
-   For optimal performance, Bolt requires the number of output channels to be divisible by 8. For compatibility, Bolt will try to pad the output channels of convolution layers to the nearest multiple of 8. You can turn on DEBUG in [bolt.cmake](bolt.cmake) to check the actual dimensions.
+   For optimal performance, Bolt requires the number of output channels to be divisible by 8. For compatibility, Bolt will try to pad the output channels of convolution layers to the nearest multiple of 8. You can turn on USE_DEBUG in [bolt.cmake](bolt.cmake) to check the actual dimensions.
 
-3. Restrictions for BNN?
+4. Restrictions for BNN?
 
    For BNN convolution layers, the number of output channels must be divisible by 32.
 
-4. Restrictions on quantization (int8)?
+5. Restrictions on quantization (int8)?
 
-   For the time being, Bolt only supports post-training int8 quantization. If quantization is activated, the second convolution layer will quantize the tensors to 8-bit integers. For now, int8 operators include Convolution, Pooling and Concatenation (end-to-end support for Squeezenet). If your network includes other operators, you may need to add type casting in the front of those operators. The quantization method is symmetrical for both activation and weight.
+   For the time being, Bolt only supports post-training int8 quantization. The quantization method is symmetrical for both activation and weight. We have added a calibration tool for image CNN pipelines. Please feel free to report cases of usage failures.
 
-5. Requirements for fp16 and int8?
+6. Requirements for fp16 and int8?
 
    Only arm-v8.2 supports fp16 and int8 dotprod instructions.
 
-6. Restrictions for MALI?
+7. Restrictions for MALI?
 
    Only llvm compilation supports MALI computing.
 

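The Memory Reuse Optimizer mentioned in the README hunk above is not part of this commit's diff, but the idea it describes (hand a feature map's buffer to a later tensor once its last consumer has run) can be sketched as a greedy assignment over last-use indices. The C++ below is an illustration of the concept only, under assumed data structures, not bolt's implementation.

    // Conceptual sketch only (not bolt code): greedy buffer reuse driven by
    // each tensor's last-use position. A real optimizer would also track
    // buffer sizes and alignment; this only shows the liveness idea.
    #include <cstdio>
    #include <vector>

    struct Tensor { int producedAt; int lastUse; };  // op indices: producer and final consumer

    // Tensors must be listed in production order; returns a slot id per tensor.
    static std::vector<int> assignSlots(const std::vector<Tensor>& tensors)
    {
        std::vector<int> slotOf(tensors.size());
        std::vector<int> slotBusyUntil;  // op index of each slot's current occupant's last consumer
        for (size_t t = 0; t < tensors.size(); ++t) {
            int chosen = -1;
            for (size_t s = 0; s < slotBusyUntil.size(); ++s) {
                if (slotBusyUntil[s] < tensors[t].producedAt) { chosen = (int)s; break; }  // occupant is dead, reuse its space
            }
            if (chosen < 0) { chosen = (int)slotBusyUntil.size(); slotBusyUntil.push_back(0); }
            slotBusyUntil[chosen] = tensors[t].lastUse;
            slotOf[t] = chosen;
        }
        return slotOf;
    }

    int main()
    {
        // A plain chain: tensor i is produced by op i and last read by op i + 1,
        // so two slots suffice no matter how long the chain is.
        std::vector<Tensor> chain;
        for (int i = 0; i < 6; ++i) chain.push_back({i, i + 1});
        std::vector<int> slots = assignSlots(chain);
        for (int i = 0; i < 6; ++i) printf("tensor %d -> slot %d\n", i, slots[i]);
        return 0;
    }

In this toy chain, six feature maps share two buffers, a saving of the same order as the two-thirds reduction the README reports for real networks.
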
File renamed without changes.

blas-enhance/include/blas-enhance.h

Lines changed: 25 additions & 0 deletions
@@ -36,6 +36,31 @@ extern "C" {
                                 U32 bytes, void* tmp,
                                 TensorDesc resultDesc, void* result, Arch arch);
 
+inline DataFormat targetFormat4MatrixB(DataType dt)
+{
+    switch (dt) {
+        case DT_F16: {
+            return DF_NKN24;
+        }
+        case DT_F32: {
+#ifdef __aarch64__
+            return DF_NKN12;
+#else
+            return DF_NKN8;
+#endif
+        }
+        case DT_I8: {
+            return DF_NKN12K4;
+        }
+        default: {
+            CHECK_STATUS(NOT_SUPPORTED);
+            exit(1);
+        }
+    }
+}
+
+EE matrix_matrix_multiply_transform_rhs(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst);
+
 #ifdef __cplusplus
 }
 #endif

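The new targetFormat4MatrixB helper and the matrix_matrix_multiply_transform_rhs declaration let a caller pre-pack the right-hand-side matrix once and reuse it across multiplications. The sketch below shows how such a call might look; it is illustrative only, and the tensor2df/DF_NORMAL descriptor construction and the extra tail padding are assumptions, not taken from this commit.

    // Hypothetical usage sketch, not code from this commit: pre-pack a K x N
    // row-major FP16 matrix B into the layout reported by targetFormat4MatrixB.
    // tensor2df() and DF_NORMAL are assumed to be available as elsewhere in bolt.
    #include <vector>
    #include "blas-enhance.h"

    void prepack_rhs_f16_sketch(const F16* B, U32 K, U32 N, std::vector<F16>* packedB)
    {
        TensorDesc bDesc = tensor2df(DT_F16, DF_NORMAL, K, N);  // plain row-major descriptor (assumed helper)
        TensorDesc bTranDesc;
        packedB->resize(K * N + 32);  // small safety tail; the exact requirement is not specified in this diff
        CHECK_STATUS(matrix_matrix_multiply_transform_rhs(bDesc, B, &bTranDesc, packedB->data()));
        // bTranDesc.df should now be targetFormat4MatrixB(DT_F16), i.e. DF_NKN24.
    }
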
blas-enhance/src/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -1,4 +1,7 @@
-file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp)
+if (USE_GENERAL)
+    file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp)
+endif (USE_GENERAL)
+
 if (USE_NEON)
     if (USE_FP16)
         file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp)

blas-enhance/src/cpu/arm/arm_neon_expand.h

Lines changed: 0 additions & 78 deletions
This file was deleted.

blas-enhance/src/cpu/arm/blas_arm.h

Lines changed: 10 additions & 0 deletions
@@ -15,6 +15,7 @@
 #ifndef _H_BLAS_ARM
 #define _H_BLAS_ARM
 
+#include "error.h"
 #include "sys.h"
 #include "type.h"
 
@@ -37,4 +38,13 @@ EE mmm_arm(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K,
            void* matrixCData,
            Arch arch);
 
+inline U32 pad_to_4_multiple(U32 k)
+{
+    if (k % 4 == 0) {
+        return k;
+    } else {
+        return (k / 4) * 4 + 4;
+    }
+}
+
 #endif

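A quick self-contained check of the rounding helper added above, for illustration only: pad_to_4_multiple rounds a dimension up to the next multiple of 4, presumably so that int8 data can be grouped for 4-element dot-product processing, and the branch-free form (k + 3) / 4 * 4 gives the same result.

    // Standalone sanity check of the rounding logic in blas_arm.h (not bolt code).
    #include <cassert>
    typedef unsigned int U32;

    static U32 pad_to_4_multiple_ref(U32 k) { return (k + 3) / 4 * 4; }  // branch-free equivalent

    int main()
    {
        for (U32 k = 1; k <= 64; ++k) {
            U32 padded = (k % 4 == 0) ? k : (k / 4) * 4 + 4;  // same logic as the inline helper above
            assert(padded == pad_to_4_multiple_ref(k));
        }
        return 0;
    }
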
blas-enhance/src/cpu/arm/fp16/blas_fp16.h

Lines changed: 5 additions & 0 deletions
@@ -18,12 +18,17 @@
 #include "sys.h"
 #include "type.h"
 #include "error.h"
+#include "tensor_desc.h"
 
 
 EE mvm_fp16(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result, Arch arch);
 
 void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes);
 
+EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst);
+
+EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst);
+
 EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch);
 
 #endif

blas-enhance/src/cpu/arm/fp16/mmm.cpp

Lines changed: 45 additions & 1 deletion
@@ -16,15 +16,59 @@
 #include "error.h"
 #include "cpu/arm/fp16/blas_fp16.h"
 #include "mmm.h"
+#include "mmm_common.h"
 
 
 void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes)
 {
     *bytes = row1 * col1 + row2 * col2;
-    *bytes *= bytesOf (dt);
+    *bytes *= bytesOf(dt);
     *bytes += 32;
 }
 
+EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst)
+{
+    DataType dt;
+    U32 N, K;
+    CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N));
+    int i = 0;
+    for (; i < (int)N - 23; i += 24) {
+        matrix2_trans(24, K, N, src + i, dst + i * K);
+    }
+    for (; i < (int)N - 7; i += 8) {
+        matrix2_trans(8, K, N, src + i, dst + i * K);
+    }
+    for (; i < (int)N - 3; i += 4) {
+        matrix2_trans(4, K, N, src + i, dst + i * K);
+    }
+    if ((int)N > i) {
+        matrix2_trans(N - i, K, N, src + i, dst + i * K);
+    }
+    return SUCCESS;
+}
+
+EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst)
+{
+    DataType dt;
+    U32 N, K;
+    CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K));
+    int i = 0;
+    for (; i < (int)N - 23; i += 24) {
+        matrix1_trans(24, K, K, src + i * K, dst + i * K);
+    }
+    for (; i < (int)N - 7; i += 8) {
+        matrix1_trans(8, K, K, src + i * K, dst + i * K);
+    }
+    for (; i < (int)N - 3; i += 4) {
+        matrix1_trans(4, K, K, src + i * K, dst + i * K);
+    }
+    if ((int)N > i) {
+        matrix1_trans(N - i, K, K, src + i * K, dst + i * K);
+    }
+    return SUCCESS;
+}
+
+
 EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch)
 {
     EE ret = SUCCESS;

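The two transform functions above tile the columns of B into panels of width 24, then 8, then 4, then whatever remains, and hand each panel to matrix2_trans or matrix1_trans (defined in mmm_common.h, which is not shown in this diff). The standalone sketch below, written with plain float for readability, illustrates the panel decomposition; pack_panel is an assumed stand-in for matrix2_trans, not bolt's actual element order.

    // Simplified, self-contained illustration of the N-panel packing pattern
    // used by matrix_matrix_multiply_transform_rhsN_fp16 (not bolt code).
    #include <cstdio>
    #include <vector>

    // Copy a blk-wide strip of columns [col, col + blk) into contiguous storage:
    // for each of the K rows, append blk consecutive elements of that row.
    static void pack_panel(int blk, int K, int N, const float* src, float* dst)
    {
        for (int k = 0; k < K; ++k)
            for (int j = 0; j < blk; ++j)
                dst[k * blk + j] = src[k * N + j];
    }

    // Same loop structure as the fp16 transform above: panels of 24, 8, 4, remainder.
    static void pack_rhs_panels(int K, int N, const float* B, float* packed)
    {
        int i = 0;
        for (; i < N - 23; i += 24) pack_panel(24, K, N, B + i, packed + i * K);
        for (; i < N - 7;  i += 8)  pack_panel(8,  K, N, B + i, packed + i * K);
        for (; i < N - 3;  i += 4)  pack_panel(4,  K, N, B + i, packed + i * K);
        if (N > i)                  pack_panel(N - i, K, N, B + i, packed + i * K);
    }

    int main()
    {
        const int K = 3, N = 30;  // 30 columns -> panels of width 24, 4 and 2
        std::vector<float> B(K * N), packed(K * N);
        for (int x = 0; x < K * N; ++x) B[x] = (float)x;
        pack_rhs_panels(K, N, B.data(), packed.data());
        printf("first element of the second panel: %g\n", packed[24 * K]);  // B row 0, column 24
        return 0;
    }
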