6
6
#include < benchmark/benchmark.h>
7
7
8
8
#include < cassert>
9
- #include < cstddef>
10
- #include < cstdint>
11
- #include < cstdlib>
12
- #include < cstring>
13
9
#include < functional>
14
- #include < memory>
15
10
#include < vector>
16
11
12
+ #include " bench/subgraph/model_runtime.h"
17
13
#include " bench/subgraph/models.h"
18
14
#include " bench/utils.h"
19
15
#include " include/xnnpack.h"
20
- #include " src/xnnpack/allocator.h"
21
- #include " src/xnnpack/subgraph.h"
22
- #include < pthreadpool.h>
23
-
24
// Bundles everything needed to run one subgraph: the subgraph itself, the
// thread pool, the runtime created from the subgraph, and one buffer per
// external (input/output) value. All of these are released in the destructor.
struct ModelRuntime {
  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> model;
  pthreadpool_t threadpool = nullptr;
  xnn_runtime_t runtime = nullptr;
  std::vector<xnn_external_value> external_values;

  // Creates the thread pool with `num_threads` threads. No runtime exists yet
  // at this point, so there is nothing to delete here.
  explicit ModelRuntime(int num_threads)
      : model(nullptr, xnn_delete_subgraph),
        threadpool(pthreadpool_create(num_threads)) {}

  // Deletes the runtime first, then the thread pool it was created with, then
  // the buffers that were allocated for the external values.
  ~ModelRuntime() {
    if (runtime) {
      xnn_delete_runtime(runtime);
    }
    if (threadpool) {
      pthreadpool_destroy(threadpool);
    }
    for (xnn_external_value& value : external_values) {
      xnn_release_simd_memory(value.data);
    }
  }

  // Builds the subgraph via `model_factory` and allocates a zeroed buffer for
  // every value flagged as an external input or output.
  // Returns false if the factory yields no subgraph or an allocation fails.
  bool CreateModel(std::function<xnn_subgraph_t()> model_factory) {
    model.reset(model_factory());
    if (!model) {
      return false;
    }
    constexpr uint32_t kExternalFlags =
        XNN_VALUE_FLAG_EXTERNAL_INPUT | XNN_VALUE_FLAG_EXTERNAL_OUTPUT;
    for (uint32_t i = 0; i < model->num_values; ++i) {
      if ((model->values[i].flags & kExternalFlags) == 0) {
        continue;
      }
      // Make a buffer for this external value.
      const size_t size =
          xnn_tensor_get_size(&model->values[i]) + XNN_EXTRA_BYTES;
      void* buffer = xnn_allocate_zero_simd_memory(size);
      if (buffer == nullptr) {
        // Report the failure instead of handing a null buffer to the runtime.
        return false;
      }
      external_values.push_back(xnn_external_value{i, buffer});
    }
    return true;
  }

  // Creates the runtime from the current subgraph using this object's thread
  // pool. Must be called at most once (asserted). Returns true on success.
  bool CreateRuntime(uint32_t flags) {
    assert(!runtime);
    return xnn_status_success ==
           xnn_create_runtime_v4(model.get(), nullptr, nullptr, threadpool,
                                 flags, &runtime);
  }

  // Returns true if xnn_reshape_runtime succeeds.
  bool ReshapeRuntime() {
    return xnn_status_success == xnn_reshape_runtime(runtime);
  }

  // Binds the external value buffers to the runtime. Returns true on success.
  bool SetupRuntime() {
    return xnn_status_success ==
           xnn_setup_runtime_v2(runtime, external_values.size(),
                                external_values.data());
  }

  // Runs the runtime once. Returns true on success.
  bool Invoke() { return xnn_status_success == xnn_invoke_runtime(runtime); }
};
83
-
84
// Builds a ModelRuntime for the subgraph returned by `model_factory`, creates,
// reshapes and sets up its runtime, then times repeated invocations under
// `state`. `extra_flags` is OR-ed into FLAGS_xnn_runtime_flags when the
// runtime is created. Any failing step skips the benchmark with an error.
static void BenchmarkInvoke(benchmark::State& state,
                            std::function<xnn_subgraph_t()> model_factory,
                            uint32_t extra_flags = 0) {
  const bool initialized =
      xnn_initialize(nullptr /* allocator */) == xnn_status_success;
  if (!initialized) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  ModelRuntime model_runtime(FLAGS_num_threads);
  const bool model_created = model_runtime.CreateModel(model_factory);
  if (!model_created) {
    state.SkipWithError("failed to create model");
    return;
  }

  // TODO(dsharlet): We should have benchmarks of these steps too.
  const bool runtime_created =
      model_runtime.CreateRuntime(FLAGS_xnn_runtime_flags | extra_flags);
  if (!runtime_created) {
    state.SkipWithError("failed to create runtime");
    return;
  }

  const bool reshaped = model_runtime.ReshapeRuntime();
  if (!reshaped) {
    state.SkipWithError("failed to reshape runtime");
    return;
  }

  const bool set_up = model_runtime.SetupRuntime();
  if (!set_up) {
    state.SkipWithError("failed to setup runtime");
    return;
  }

  // The first batch performs FLAGS_benchmark_min_iters invocations; every
  // later batch performs exactly one.
  for (int batch = FLAGS_benchmark_min_iters; state.KeepRunningBatch(batch);
       batch = 1) {
    for (int remaining = batch; remaining > 0; --remaining) {
      benchmark::utils::WipePthreadpoolL2Caches(state,
                                                model_runtime.threadpool);
      const bool invoked = model_runtime.Invoke();
      if (!invoked) {
        state.SkipWithError("failed to invoke runtime");
        return;
      }
    }
  }

  // Only report the CPU frequency when a non-zero value is available.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency == 0) {
    return;
  }
  state.counters["cpufreq"] = cpu_frequency;
}
132
16
133
17
// Benchmarks models::FP32Attention built from FLAGS_batch_size and the first
// four benchmark range arguments.
static void FP32Attention(benchmark::State& state) {
  const auto build_subgraph = [&state]() {
    return models::FP32Attention(FLAGS_batch_size, state.range(0),
                                 state.range(1), state.range(2),
                                 state.range(3));
  };
  xnnpack::ModelRuntime::BenchmarkInvoke(state, build_subgraph);
}
140
24
141
25
static void FP16Attention (benchmark::State& state) {
142
- BenchmarkInvoke (
26
+ xnnpack::ModelRuntime:: BenchmarkInvoke (
143
27
state,
144
28
[&state]() {
145
29
return models::FP32Attention (FLAGS_batch_size, state.range (0 ),
@@ -150,101 +34,101 @@ static void FP16Attention(benchmark::State& state) {
150
34
}
151
35
152
36
// Benchmarks the subgraph produced by models::FP32MobileNetV1.
static void FP32MobileNetV1(benchmark::State& state) {
  const auto model_factory = models::FP32MobileNetV1;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, model_factory);
}
155
39
156
40
// Benchmarks the subgraph produced by models::FP32MobileNetV2.
static void FP32MobileNetV2(benchmark::State& state) {
  const auto model_factory = models::FP32MobileNetV2;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, model_factory);
}
159
43
160
44
// Benchmarks the subgraph produced by models::FP32MobileNetV3Large.
static void FP32MobileNetV3Large(benchmark::State& state) {
  const auto model_factory = models::FP32MobileNetV3Large;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, model_factory);
}
163
47
164
48
// Benchmarks the subgraph produced by models::FP32MobileNetV3Small.
static void FP32MobileNetV3Small(benchmark::State& state) {
  const auto model_factory = models::FP32MobileNetV3Small;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, model_factory);
}
167
51
168
52
// Benchmarks the models::FP32MobileNetV1 subgraph with
// XNN_FLAG_FORCE_FP16_INFERENCE passed as the extra runtime flag.
static void FP16MobileNetV1(benchmark::State& state) {
  const auto extra_flags = XNN_FLAG_FORCE_FP16_INFERENCE;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV1,
                                         extra_flags);
}
172
56
173
57
// Benchmarks the models::FP32MobileNetV2 subgraph with
// XNN_FLAG_FORCE_FP16_INFERENCE passed as the extra runtime flag.
static void FP16MobileNetV2(benchmark::State& state) {
  const auto extra_flags = XNN_FLAG_FORCE_FP16_INFERENCE;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV2,
                                         extra_flags);
}
177
61
178
62
// Benchmarks the models::FP32MobileNetV3Large subgraph with
// XNN_FLAG_FORCE_FP16_INFERENCE passed as the extra runtime flag.
static void FP16MobileNetV3Large(benchmark::State& state) {
  const auto extra_flags = XNN_FLAG_FORCE_FP16_INFERENCE;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Large,
                                         extra_flags);
}
182
66
183
67
// Benchmarks the models::FP32MobileNetV3Small subgraph with
// XNN_FLAG_FORCE_FP16_INFERENCE passed as the extra runtime flag.
static void FP16MobileNetV3Small(benchmark::State& state) {
  const auto extra_flags = XNN_FLAG_FORCE_FP16_INFERENCE;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Small,
                                         extra_flags);
}
187
71
188
72
// Benchmarks models::QD8Attention built from FLAGS_batch_size, the first four
// benchmark range arguments, and a locally owned weights object.
static void QD8Attention(benchmark::State& state) {
  // Declared before the factory so it stays alive for the whole benchmark run.
  models::QD8AttentionWeights weights;
  const auto build_subgraph = [&state, &weights]() {
    return models::QD8Attention(FLAGS_batch_size, state.range(0),
                                state.range(1), state.range(2), state.range(3),
                                weights);
  };
  xnnpack::ModelRuntime::BenchmarkInvoke(state, build_subgraph);
}
196
80
197
81
// Benchmarks the subgraph produced by models::QS8MobileNetV2.
static void QS8MobileNetV2(benchmark::State& state) {
  const auto model_factory = models::QS8MobileNetV2;
  xnnpack::ModelRuntime::BenchmarkInvoke(state, model_factory);
}
200
84
201
85
// Benchmarks models::FP32Elementwise; range arguments 0, 1 and 2 supply the
// batch size, number of elements and depth respectively.
static void FP32Elementwise(benchmark::State& state) {
  const auto build_subgraph = [&state]() {
    return models::FP32Elementwise(/*batch_size=*/state.range(0),
                                   /*num_elements=*/state.range(1),
                                   /*depth=*/state.range(2));
  };
  xnnpack::ModelRuntime::BenchmarkInvoke(state, build_subgraph);
}
208
92
209
93
// Benchmarks models::FP32LayerNorm built from the first four benchmark range
// arguments.
static void FP32LayerNorm(benchmark::State& state) {
  const auto build_subgraph = [&state]() {
    return models::FP32LayerNorm(state.range(0), state.range(1),
                                 state.range(2), state.range(3));
  };
  xnnpack::ModelRuntime::BenchmarkInvoke(state, build_subgraph);
}
215
99
216
100
// Benchmarks models::FP32L2Norm built from the first four benchmark range
// arguments.
static void FP32L2Norm(benchmark::State& state) {
  const auto build_subgraph = [&state]() {
    return models::FP32L2Norm(state.range(0), state.range(1), state.range(2),
                              state.range(3));
  };
  xnnpack::ModelRuntime::BenchmarkInvoke(state, build_subgraph);
}
222
106
223
107
// Benchmarks models::FP32Softmax with use_softmax=false, built from the first
// four benchmark range arguments.
static void FP32SoftmaxDecomp(benchmark::State& state) {
  const auto build_subgraph = [&state]() {
    return models::FP32Softmax(state.range(0), state.range(1), state.range(2),
                               state.range(3), /*use_softmax=*/false);
  };
  xnnpack::ModelRuntime::BenchmarkInvoke(state, build_subgraph);
}
229
113
230
114
// Benchmarks models::FP32Softmax with use_softmax=true, built from the first
// four benchmark range arguments.
static void FP32SoftmaxFused(benchmark::State& state) {
  const auto build_subgraph = [&state]() {
    return models::FP32Softmax(state.range(0), state.range(1), state.range(2),
                               state.range(3), /*use_softmax=*/true);
  };
  xnnpack::ModelRuntime::BenchmarkInvoke(state, build_subgraph);
}
236
120
237
121
// Benchmarks models::FP32DepthwiseSeparable built from the first five
// benchmark range arguments and a locally owned weights object.
static void FP32DepthwiseSeparable(benchmark::State& state) {
  // Declared before the factory so it stays alive for the whole benchmark run.
  models::FP32DepthwiseSeparableWeights weights;
  const auto build_subgraph = [&state, &weights]() {
    return models::FP32DepthwiseSeparable(state.range(0), state.range(1),
                                          state.range(2), state.range(3),
                                          state.range(4), weights);
  };
  xnnpack::ModelRuntime::BenchmarkInvoke(state, build_subgraph);
}
245
129
246
130
static void QD8TransformerBlock (benchmark::State& state) {
247
- BenchmarkInvoke (state, [&state]() {
131
+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
248
132
return models::QD8TransformerBlock (
249
133
FLAGS_batch_size, /* sequence_length=*/ state.range (0 ),
250
134
/* embedding_dim=*/ state.range (1 ), /* num_heads=*/ state.range (2 ),
@@ -253,7 +137,7 @@ static void QD8TransformerBlock(benchmark::State& state) {
253
137
}
254
138
255
139
static void FP32TransformerBlock (benchmark::State& state) {
256
- BenchmarkInvoke (state, [&state]() {
140
+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
257
141
return models::FP32TransformerBlock (
258
142
FLAGS_batch_size, /* sequence_length=*/ state.range (0 ),
259
143
/* embedding_dim=*/ state.range (1 ), /* num_heads=*/ state.range (2 ),
@@ -262,7 +146,7 @@ static void FP32TransformerBlock(benchmark::State& state) {
262
146
}
263
147
264
148
static void FP16TransformerBlock (benchmark::State& state) {
265
- BenchmarkInvoke (
149
+ xnnpack::ModelRuntime:: BenchmarkInvoke (
266
150
state,
267
151
[&state]() {
268
152
return models::FP32TransformerBlock (
0 commit comments