Skip to content

Commit 72c19a2

Browse files
gonnet authored and xnnpack-bot committed
Add subgraph benchmarks for variants of the fully-connected op.
PiperOrigin-RevId: 767275232
1 parent 2687503 commit 72c19a2

File tree

8 files changed

+486
-145
lines changed

8 files changed

+486
-145
lines changed

bench/subgraph/BUILD

Lines changed: 27 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,20 @@ load(
1010
"xnnpack_slow_benchmark_tags",
1111
)
1212

13+
xnnpack_cxx_library(
14+
name = "model_runtime",
15+
srcs = ["model_runtime.cc"],
16+
hdrs = ["model_runtime.h"],
17+
deps = [
18+
"//:XNNPACK",
19+
"//:allocator",
20+
"//:subgraph_h",
21+
"//bench:bench_utils",
22+
"@com_google_benchmark//:benchmark",
23+
"@pthreadpool",
24+
],
25+
)
26+
1327
xnnpack_cxx_library(
1428
name = "models",
1529
testonly = 1,
@@ -42,11 +56,21 @@ xnnpack_benchmark(
4256
srcs = ["benchmark.cc"],
4357
tags = xnnpack_slow_benchmark_tags(),
4458
deps = [
59+
":model_runtime",
4560
":models",
46-
"//:allocator",
47-
"//:subgraph",
4861
"//:xnnpack_h",
4962
"//bench:bench_utils",
50-
"@pthreadpool",
63+
],
64+
)
65+
66+
xnnpack_benchmark(
67+
name = "fully_connected",
68+
srcs = ["fully_connected.cc"],
69+
tags = xnnpack_slow_benchmark_tags(),
70+
deps = [
71+
":model_runtime",
72+
"//:xnnpack_h",
73+
"//bench:bench_utils",
74+
"//test:next_prime",
5175
],
5276
)

bench/subgraph/CMakeLists.txt

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,10 +26,23 @@ IF(XNNPACK_BUILD_LIBRARY)
2626
SET_TARGET_PROPERTIES(models PROPERTIES CXX_EXTENSIONS YES)
2727
TARGET_LINK_LIBRARIES(models PRIVATE XNNPACK)
2828

29+
ADD_LIBRARY(model_runtime STATIC
30+
model_runtime.cc)
31+
SET_TARGET_PROPERTIES(model_runtime PROPERTIES CXX_EXTENSIONS YES)
32+
TARGET_LINK_LIBRARIES(model_runtime PRIVATE XNNPACK)
33+
2934
ADD_EXECUTABLE(bench-models benchmark.cc)
3035
TARGET_LINK_LIBRARIES(bench-models PRIVATE
3136
bench-utils
3237
benchmark::benchmark
3338
models
39+
model_runtime
40+
XNNPACK)
41+
42+
ADD_EXECUTABLE(bench-fully-connected fully_connected.cc)
43+
TARGET_LINK_LIBRARIES(bench-fully-connected PRIVATE
44+
bench-utils
45+
benchmark::benchmark
46+
model_runtime
3447
XNNPACK)
3548
ENDIF()

bench/subgraph/benchmark.cc

Lines changed: 26 additions & 142 deletions
Original file line number | Diff line number | Diff line change
@@ -6,140 +6,24 @@
66
#include <benchmark/benchmark.h>
77

88
#include <cassert>
9-
#include <cstddef>
10-
#include <cstdint>
11-
#include <cstdlib>
12-
#include <cstring>
139
#include <functional>
14-
#include <memory>
1510
#include <vector>
1611

12+
#include "bench/subgraph/model_runtime.h"
1713
#include "bench/subgraph/models.h"
1814
#include "bench/utils.h"
1915
#include "include/xnnpack.h"
20-
#include "src/xnnpack/allocator.h"
21-
#include "src/xnnpack/subgraph.h"
22-
#include <pthreadpool.h>
23-
24-
struct ModelRuntime {
25-
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> model;
26-
pthreadpool_t threadpool = nullptr;
27-
xnn_runtime_t runtime = nullptr;
28-
std::vector<xnn_external_value> external_values;
29-
30-
explicit ModelRuntime(int num_threads) : model(nullptr, xnn_delete_subgraph) {
31-
xnn_delete_runtime(runtime);
32-
threadpool = pthreadpool_create(num_threads);
33-
}
34-
35-
~ModelRuntime() {
36-
if (runtime) {
37-
xnn_delete_runtime(runtime);
38-
}
39-
if (threadpool) {
40-
pthreadpool_destroy(threadpool);
41-
}
42-
for (xnn_external_value& i : external_values) {
43-
xnn_release_simd_memory(i.data);
44-
}
45-
}
46-
47-
bool CreateModel(std::function<xnn_subgraph_t()> model_factory) {
48-
model.reset(model_factory());
49-
if (!model) {
50-
return false;
51-
}
52-
for (uint32_t i = 0; i < model->num_values; ++i) {
53-
if ((model->values[i].flags & (XNN_VALUE_FLAG_EXTERNAL_INPUT |
54-
XNN_VALUE_FLAG_EXTERNAL_OUTPUT)) == 0) {
55-
continue;
56-
}
57-
// Make a buffer for this external value.
58-
size_t size = xnn_tensor_get_size(&model->values[i]) + XNN_EXTRA_BYTES;
59-
external_values.push_back(
60-
xnn_external_value{i, xnn_allocate_zero_simd_memory(size)});
61-
}
62-
return model != nullptr;
63-
}
64-
65-
bool CreateRuntime(uint32_t flags) {
66-
assert(!runtime);
67-
return xnn_status_success == xnn_create_runtime_v4(model.get(), nullptr,
68-
nullptr, threadpool,
69-
flags, &runtime);
70-
}
71-
bool ReshapeRuntime() {
72-
return xnn_status_success == xnn_reshape_runtime(runtime);
73-
}
74-
75-
bool SetupRuntime() {
76-
return xnn_status_success == xnn_setup_runtime_v2(runtime,
77-
external_values.size(),
78-
external_values.data());
79-
}
80-
81-
bool Invoke() { return xnn_status_success == xnn_invoke_runtime(runtime); }
82-
};
83-
84-
static void BenchmarkInvoke(benchmark::State& state,
85-
std::function<xnn_subgraph_t()> model_factory,
86-
uint32_t extra_flags = 0) {
87-
if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
88-
state.SkipWithError("failed to initialize XNNPACK");
89-
return;
90-
}
91-
92-
ModelRuntime model_runtime(FLAGS_num_threads);
93-
if (!model_runtime.CreateModel(model_factory)) {
94-
state.SkipWithError("failed to create model");
95-
return;
96-
}
97-
98-
// TODO(dsharlet): We should have benchmarks of these steps too.
99-
if (!model_runtime.CreateRuntime(FLAGS_xnn_runtime_flags | extra_flags)) {
100-
state.SkipWithError("failed to create runtime");
101-
return;
102-
}
103-
104-
if (!model_runtime.ReshapeRuntime()) {
105-
state.SkipWithError("failed to reshape runtime");
106-
return;
107-
}
108-
109-
if (!model_runtime.SetupRuntime()) {
110-
state.SkipWithError("failed to setup runtime");
111-
return;
112-
}
113-
114-
int num_iters = FLAGS_benchmark_min_iters;
115-
while (state.KeepRunningBatch(num_iters)) {
116-
for (int iter = 0; iter < num_iters; iter++) {
117-
benchmark::utils::WipePthreadpoolL2Caches(state,
118-
model_runtime.threadpool);
119-
if (!model_runtime.Invoke()) {
120-
state.SkipWithError("failed to invoke runtime");
121-
return;
122-
}
123-
}
124-
num_iters = 1;
125-
}
126-
127-
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
128-
if (cpu_frequency != 0) {
129-
state.counters["cpufreq"] = cpu_frequency;
130-
}
131-
}
13216

13317
static void FP32Attention(benchmark::State& state) {
134-
BenchmarkInvoke(state, [&state]() {
18+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
13519
return models::FP32Attention(FLAGS_batch_size, state.range(0),
13620
state.range(1), state.range(2),
13721
state.range(3));
13822
});
13923
}
14024

14125
static void FP16Attention(benchmark::State& state) {
142-
BenchmarkInvoke(
26+
xnnpack::ModelRuntime::BenchmarkInvoke(
14327
state,
14428
[&state]() {
14529
return models::FP32Attention(FLAGS_batch_size, state.range(0),
@@ -150,101 +34,101 @@ static void FP16Attention(benchmark::State& state) {
15034
}
15135

15236
static void FP32MobileNetV1(benchmark::State& state) {
153-
BenchmarkInvoke(state, models::FP32MobileNetV1);
37+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV1);
15438
}
15539

15640
static void FP32MobileNetV2(benchmark::State& state) {
157-
BenchmarkInvoke(state, models::FP32MobileNetV2);
41+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV2);
15842
}
15943

16044
static void FP32MobileNetV3Large(benchmark::State& state) {
161-
BenchmarkInvoke(state, models::FP32MobileNetV3Large);
45+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Large);
16246
}
16347

16448
static void FP32MobileNetV3Small(benchmark::State& state) {
165-
BenchmarkInvoke(state, models::FP32MobileNetV3Small);
49+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Small);
16650
}
16751

16852
static void FP16MobileNetV1(benchmark::State& state) {
169-
BenchmarkInvoke(state, models::FP32MobileNetV1,
170-
XNN_FLAG_FORCE_FP16_INFERENCE);
53+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV1,
54+
XNN_FLAG_FORCE_FP16_INFERENCE);
17155
}
17256

17357
static void FP16MobileNetV2(benchmark::State& state) {
174-
BenchmarkInvoke(state, models::FP32MobileNetV2,
175-
XNN_FLAG_FORCE_FP16_INFERENCE);
58+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV2,
59+
XNN_FLAG_FORCE_FP16_INFERENCE);
17660
}
17761

17862
static void FP16MobileNetV3Large(benchmark::State& state) {
179-
BenchmarkInvoke(state, models::FP32MobileNetV3Large,
180-
XNN_FLAG_FORCE_FP16_INFERENCE);
63+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Large,
64+
XNN_FLAG_FORCE_FP16_INFERENCE);
18165
}
18266

18367
static void FP16MobileNetV3Small(benchmark::State& state) {
184-
BenchmarkInvoke(state, models::FP32MobileNetV3Small,
185-
XNN_FLAG_FORCE_FP16_INFERENCE);
68+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Small,
69+
XNN_FLAG_FORCE_FP16_INFERENCE);
18670
}
18771

18872
static void QD8Attention(benchmark::State& state) {
18973
models::QD8AttentionWeights weights;
190-
BenchmarkInvoke(state, [&state, &weights]() {
74+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state, &weights]() {
19175
return models::QD8Attention(FLAGS_batch_size, state.range(0),
19276
state.range(1), state.range(2), state.range(3),
19377
weights);
19478
});
19579
}
19680

19781
static void QS8MobileNetV2(benchmark::State& state) {
198-
BenchmarkInvoke(state, models::QS8MobileNetV2);
82+
xnnpack::ModelRuntime::BenchmarkInvoke(state, models::QS8MobileNetV2);
19983
}
20084

20185
static void FP32Elementwise(benchmark::State& state) {
202-
BenchmarkInvoke(state, [&state]() {
86+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
20387
return models::FP32Elementwise(/*batch_size=*/state.range(0),
20488
/*num_elements=*/state.range(1),
20589
/*depth=*/state.range(2));
20690
});
20791
}
20892

20993
static void FP32LayerNorm(benchmark::State& state) {
210-
BenchmarkInvoke(state, [&state]() {
94+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
21195
return models::FP32LayerNorm(state.range(0), state.range(1), state.range(2),
21296
state.range(3));
21397
});
21498
}
21599

216100
static void FP32L2Norm(benchmark::State& state) {
217-
BenchmarkInvoke(state, [&state]() {
101+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
218102
return models::FP32L2Norm(state.range(0), state.range(1), state.range(2),
219103
state.range(3));
220104
});
221105
}
222106

223107
static void FP32SoftmaxDecomp(benchmark::State& state) {
224-
BenchmarkInvoke(state, [&state]() {
108+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
225109
return models::FP32Softmax(state.range(0), state.range(1), state.range(2),
226110
state.range(3), /*use_softmax=*/false);
227111
});
228112
}
229113

230114
static void FP32SoftmaxFused(benchmark::State& state) {
231-
BenchmarkInvoke(state, [&state]() {
115+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
232116
return models::FP32Softmax(state.range(0), state.range(1), state.range(2),
233117
state.range(3), /*use_softmax=*/true);
234118
});
235119
}
236120

237121
static void FP32DepthwiseSeparable(benchmark::State& state) {
238122
models::FP32DepthwiseSeparableWeights weights;
239-
BenchmarkInvoke(state, [&state, &weights]() {
123+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state, &weights]() {
240124
return models::FP32DepthwiseSeparable(state.range(0), state.range(1),
241125
state.range(2), state.range(3),
242126
state.range(4), weights);
243127
});
244128
}
245129

246130
static void QD8TransformerBlock(benchmark::State& state) {
247-
BenchmarkInvoke(state, [&state]() {
131+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
248132
return models::QD8TransformerBlock(
249133
FLAGS_batch_size, /*sequence_length=*/state.range(0),
250134
/*embedding_dim=*/state.range(1), /*num_heads=*/state.range(2),
@@ -253,7 +137,7 @@ static void QD8TransformerBlock(benchmark::State& state) {
253137
}
254138

255139
static void FP32TransformerBlock(benchmark::State& state) {
256-
BenchmarkInvoke(state, [&state]() {
140+
xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
257141
return models::FP32TransformerBlock(
258142
FLAGS_batch_size, /*sequence_length=*/state.range(0),
259143
/*embedding_dim=*/state.range(1), /*num_heads=*/state.range(2),
@@ -262,7 +146,7 @@ static void FP32TransformerBlock(benchmark::State& state) {
262146
}
263147

264148
static void FP16TransformerBlock(benchmark::State& state) {
265-
BenchmarkInvoke(
149+
xnnpack::ModelRuntime::BenchmarkInvoke(
266150
state,
267151
[&state]() {
268152
return models::FP32TransformerBlock(

0 commit comments

Comments
 (0)