#include <benchmark/benchmark.h>

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <memory>
#include <utility>
#include <vector>

#include "bench/subgraph/model_runtime.h"
#include "bench/subgraph/models.h"
#include "bench/utils.h"
#include "include/xnnpack.h"
#include "src/xnnpack/allocator.h"
#include "src/xnnpack/subgraph.h"
#include <pthreadpool.h>
23-
24- struct ModelRuntime {
25- std::unique_ptr<xnn_subgraph, decltype (&xnn_delete_subgraph)> model;
26- pthreadpool_t threadpool = nullptr ;
27- xnn_runtime_t runtime = nullptr ;
28- std::vector<xnn_external_value> external_values;
29-
30- explicit ModelRuntime (int num_threads) : model(nullptr , xnn_delete_subgraph) {
31- xnn_delete_runtime (runtime);
32- threadpool = pthreadpool_create (num_threads);
33- }
34-
35- ~ModelRuntime () {
36- if (runtime) {
37- xnn_delete_runtime (runtime);
38- }
39- if (threadpool) {
40- pthreadpool_destroy (threadpool);
41- }
42- for (xnn_external_value& i : external_values) {
43- xnn_release_simd_memory (i.data );
44- }
45- }
46-
47- bool CreateModel (std::function<xnn_subgraph_t ()> model_factory) {
48- model.reset (model_factory ());
49- if (!model) {
50- return false ;
51- }
52- for (uint32_t i = 0 ; i < model->num_values ; ++i) {
53- if ((model->values [i].flags & (XNN_VALUE_FLAG_EXTERNAL_INPUT |
54- XNN_VALUE_FLAG_EXTERNAL_OUTPUT)) == 0 ) {
55- continue ;
56- }
57- // Make a buffer for this external value.
58- size_t size = xnn_tensor_get_size (&model->values [i]) + XNN_EXTRA_BYTES;
59- external_values.push_back (
60- xnn_external_value{i, xnn_allocate_zero_simd_memory (size)});
61- }
62- return model != nullptr ;
63- }
64-
65- bool CreateRuntime (uint32_t flags) {
66- assert (!runtime);
67- return xnn_status_success == xnn_create_runtime_v4 (model.get (), nullptr ,
68- nullptr , threadpool,
69- flags, &runtime);
70- }
71- bool ReshapeRuntime () {
72- return xnn_status_success == xnn_reshape_runtime (runtime);
73- }
74-
75- bool SetupRuntime () {
76- return xnn_status_success == xnn_setup_runtime_v2 (runtime,
77- external_values.size (),
78- external_values.data ());
79- }
80-
81- bool Invoke () { return xnn_status_success == xnn_invoke_runtime (runtime); }
82- };
83-
84- static void BenchmarkInvoke (benchmark::State& state,
85- std::function<xnn_subgraph_t ()> model_factory,
86- uint32_t extra_flags = 0) {
87- if (xnn_initialize (nullptr /* allocator */ ) != xnn_status_success) {
88- state.SkipWithError (" failed to initialize XNNPACK" );
89- return ;
90- }
91-
92- ModelRuntime model_runtime (FLAGS_num_threads);
93- if (!model_runtime.CreateModel (model_factory)) {
94- state.SkipWithError (" failed to create model" );
95- return ;
96- }
97-
98- // TODO(dsharlet): We should have benchmarks of these steps too.
99- if (!model_runtime.CreateRuntime (FLAGS_xnn_runtime_flags | extra_flags)) {
100- state.SkipWithError (" failed to create runtime" );
101- return ;
102- }
103-
104- if (!model_runtime.ReshapeRuntime ()) {
105- state.SkipWithError (" failed to reshape runtime" );
106- return ;
107- }
108-
109- if (!model_runtime.SetupRuntime ()) {
110- state.SkipWithError (" failed to setup runtime" );
111- return ;
112- }
113-
114- int num_iters = FLAGS_benchmark_min_iters;
115- while (state.KeepRunningBatch (num_iters)) {
116- for (int iter = 0 ; iter < num_iters; iter++) {
117- benchmark::utils::WipePthreadpoolL2Caches (state,
118- model_runtime.threadpool );
119- if (!model_runtime.Invoke ()) {
120- state.SkipWithError (" failed to invoke runtime" );
121- return ;
122- }
123- }
124- num_iters = 1 ;
125- }
126-
127- const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency ();
128- if (cpu_frequency != 0 ) {
129- state.counters [" cpufreq" ] = cpu_frequency;
130- }
131- }
13216
13317static void FP32Attention (benchmark::State& state) {
134- BenchmarkInvoke (state, [&state]() {
18+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
13519 return models::FP32Attention (FLAGS_batch_size, state.range (0 ),
13620 state.range (1 ), state.range (2 ),
13721 state.range (3 ));
13822 });
13923}
14024
14125static void FP16Attention (benchmark::State& state) {
142- BenchmarkInvoke (
26+ xnnpack::ModelRuntime:: BenchmarkInvoke (
14327 state,
14428 [&state]() {
14529 return models::FP32Attention (FLAGS_batch_size, state.range (0 ),
@@ -150,101 +34,101 @@ static void FP16Attention(benchmark::State& state) {
15034}
15135
15236static void FP32MobileNetV1 (benchmark::State& state) {
153- BenchmarkInvoke (state, models::FP32MobileNetV1);
37+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::FP32MobileNetV1);
15438}
15539
15640static void FP32MobileNetV2 (benchmark::State& state) {
157- BenchmarkInvoke (state, models::FP32MobileNetV2);
41+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::FP32MobileNetV2);
15842}
15943
16044static void FP32MobileNetV3Large (benchmark::State& state) {
161- BenchmarkInvoke (state, models::FP32MobileNetV3Large);
45+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::FP32MobileNetV3Large);
16246}
16347
16448static void FP32MobileNetV3Small (benchmark::State& state) {
165- BenchmarkInvoke (state, models::FP32MobileNetV3Small);
49+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::FP32MobileNetV3Small);
16650}
16751
16852static void FP16MobileNetV1 (benchmark::State& state) {
169- BenchmarkInvoke (state, models::FP32MobileNetV1,
170- XNN_FLAG_FORCE_FP16_INFERENCE);
53+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::FP32MobileNetV1,
54+ XNN_FLAG_FORCE_FP16_INFERENCE);
17155}
17256
17357static void FP16MobileNetV2 (benchmark::State& state) {
174- BenchmarkInvoke (state, models::FP32MobileNetV2,
175- XNN_FLAG_FORCE_FP16_INFERENCE);
58+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::FP32MobileNetV2,
59+ XNN_FLAG_FORCE_FP16_INFERENCE);
17660}
17761
17862static void FP16MobileNetV3Large (benchmark::State& state) {
179- BenchmarkInvoke (state, models::FP32MobileNetV3Large,
180- XNN_FLAG_FORCE_FP16_INFERENCE);
63+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::FP32MobileNetV3Large,
64+ XNN_FLAG_FORCE_FP16_INFERENCE);
18165}
18266
18367static void FP16MobileNetV3Small (benchmark::State& state) {
184- BenchmarkInvoke (state, models::FP32MobileNetV3Small,
185- XNN_FLAG_FORCE_FP16_INFERENCE);
68+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::FP32MobileNetV3Small,
69+ XNN_FLAG_FORCE_FP16_INFERENCE);
18670}
18771
18872static void QD8Attention (benchmark::State& state) {
18973 models::QD8AttentionWeights weights;
190- BenchmarkInvoke (state, [&state, &weights]() {
74+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state, &weights]() {
19175 return models::QD8Attention (FLAGS_batch_size, state.range (0 ),
19276 state.range (1 ), state.range (2 ), state.range (3 ),
19377 weights);
19478 });
19579}
19680
19781static void QS8MobileNetV2 (benchmark::State& state) {
198- BenchmarkInvoke (state, models::QS8MobileNetV2);
82+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, models::QS8MobileNetV2);
19983}
20084
20185static void FP32Elementwise (benchmark::State& state) {
202- BenchmarkInvoke (state, [&state]() {
86+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
20387 return models::FP32Elementwise (/* batch_size=*/ state.range (0 ),
20488 /* num_elements=*/ state.range (1 ),
20589 /* depth=*/ state.range (2 ));
20690 });
20791}
20892
20993static void FP32LayerNorm (benchmark::State& state) {
210- BenchmarkInvoke (state, [&state]() {
94+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
21195 return models::FP32LayerNorm (state.range (0 ), state.range (1 ), state.range (2 ),
21296 state.range (3 ));
21397 });
21498}
21599
216100static void FP32L2Norm (benchmark::State& state) {
217- BenchmarkInvoke (state, [&state]() {
101+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
218102 return models::FP32L2Norm (state.range (0 ), state.range (1 ), state.range (2 ),
219103 state.range (3 ));
220104 });
221105}
222106
223107static void FP32SoftmaxDecomp (benchmark::State& state) {
224- BenchmarkInvoke (state, [&state]() {
108+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
225109 return models::FP32Softmax (state.range (0 ), state.range (1 ), state.range (2 ),
226110 state.range (3 ), /* use_softmax=*/ false );
227111 });
228112}
229113
230114static void FP32SoftmaxFused (benchmark::State& state) {
231- BenchmarkInvoke (state, [&state]() {
115+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
232116 return models::FP32Softmax (state.range (0 ), state.range (1 ), state.range (2 ),
233117 state.range (3 ), /* use_softmax=*/ true );
234118 });
235119}
236120
237121static void FP32DepthwiseSeparable (benchmark::State& state) {
238122 models::FP32DepthwiseSeparableWeights weights;
239- BenchmarkInvoke (state, [&state, &weights]() {
123+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state, &weights]() {
240124 return models::FP32DepthwiseSeparable (state.range (0 ), state.range (1 ),
241125 state.range (2 ), state.range (3 ),
242126 state.range (4 ), weights);
243127 });
244128}
245129
246130static void QD8TransformerBlock (benchmark::State& state) {
247- BenchmarkInvoke (state, [&state]() {
131+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
248132 return models::QD8TransformerBlock (
249133 FLAGS_batch_size, /* sequence_length=*/ state.range (0 ),
250134 /* embedding_dim=*/ state.range (1 ), /* num_heads=*/ state.range (2 ),
@@ -253,7 +137,7 @@ static void QD8TransformerBlock(benchmark::State& state) {
253137}
254138
255139static void FP32TransformerBlock (benchmark::State& state) {
256- BenchmarkInvoke (state, [&state]() {
140+ xnnpack::ModelRuntime:: BenchmarkInvoke (state, [&state]() {
257141 return models::FP32TransformerBlock (
258142 FLAGS_batch_size, /* sequence_length=*/ state.range (0 ),
259143 /* embedding_dim=*/ state.range (1 ), /* num_heads=*/ state.range (2 ),
@@ -262,7 +146,7 @@ static void FP32TransformerBlock(benchmark::State& state) {
262146}
263147
264148static void FP16TransformerBlock (benchmark::State& state) {
265- BenchmarkInvoke (
149+ xnnpack::ModelRuntime:: BenchmarkInvoke (
266150 state,
267151 [&state]() {
268152 return models::FP32TransformerBlock (
0 commit comments