Skip to content

Commit 48fd62f

Browse files
gonnet authored and xnnpack-bot committed
Add subgraph benchmarks for variants of the fully-connected op.
PiperOrigin-RevId: 767275232
1 parent 2687503 commit 48fd62f

File tree

5 files changed

+243
-3
lines changed

5 files changed

+243
-3
lines changed

bench/subgraph/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ xnnpack_cxx_library(
2525
"fp32-mobilenet-v3-small.cc",
2626
"fp32-softmax.cc",
2727
"fp32-transformer.cc",
28+
"fully-connected.cc",
2829
"qd8-attention.cc",
2930
"qd8-transformer.cc",
3031
"qs8-mobilenet-v2.cc",

bench/subgraph/benchmark.cc

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <memory>
1515
#include <vector>
1616

17+
#include "bench/gemm.h"
1718
#include "bench/subgraph/models.h"
1819
#include "bench/utils.h"
1920
#include "include/xnnpack.h"
@@ -273,6 +274,34 @@ static void FP16TransformerBlock(benchmark::State& state) {
273274
XNN_FLAG_FORCE_FP16_INFERENCE);
274275
}
275276

277+
// Benchmarks the dynamically-quantized (QD8) fully-connected model.
// Benchmark args are {M, N, K}: range(0)=batch, range(1)=output channels,
// range(2)=input channels.
static void QD8FullyConnected(benchmark::State& state) {
  const auto batch_size = state.range(0);
  const auto output_channels = state.range(1);
  const auto input_channels = state.range(2);
  BenchmarkInvoke(state, [=]() {
    return models::QD8FullyConnected(batch_size, input_channels,
                                     output_channels);
  });
}
284+
285+
// Benchmarks the FP32 fully-connected model.
// Benchmark args are {M, N, K}: range(0)=batch, range(1)=output channels,
// range(2)=input channels.
static void FP32FullyConnected(benchmark::State& state) {
  const auto batch_size = state.range(0);
  const auto output_channels = state.range(1);
  const auto input_channels = state.range(2);
  BenchmarkInvoke(state, [=]() {
    return models::FP32FullyConnected(batch_size, input_channels,
                                      output_channels);
  });
}
292+
293+
// Benchmarks the fully-connected model with FP16 inference forced at runtime.
// Deliberately builds the FP32 model graph and relies on
// XNN_FLAG_FORCE_FP16_INFERENCE to run it in half precision (same pattern as
// FP16TransformerBlock above).
static void FP16FullyConnected(benchmark::State& state) {
  const auto batch_size = state.range(0);
  const auto output_channels = state.range(1);
  const auto input_channels = state.range(2);
  BenchmarkInvoke(
      state,
      [=]() {
        return models::FP32FullyConnected(batch_size, input_channels,
                                          output_channels);
      },
      XNN_FLAG_FORCE_FP16_INFERENCE);
}
304+
276305
static void AttentionArguments(benchmark::internal::Benchmark* b) {
277306
b->ArgNames({"T", "H", "N", "S"});
278307
b->Args({16, 25, 24, 4});
@@ -326,6 +355,9 @@ static void DepthwiseSeparableArguments(benchmark::internal::Benchmark* b) {
326355
static void TransformerBlockArguments(benchmark::internal::Benchmark* b) {
327356
b->ArgNames({"T", "D", "N", "H", "F"});
328357

358+
// Gemma3-1B parameters.
359+
b->Args({128, 1152, 4, 256, 6 * 1152});
360+
329361
// GeminiXXS parameters.
330362
b->Args({128, 1536, 6, 256, 8 * 1536});
331363

@@ -334,9 +366,6 @@ static void TransformerBlockArguments(benchmark::internal::Benchmark* b) {
334366

335367
// Gemma2-2B parameters.
336368
b->Args({128, 2304, 8, 256, 9216});
337-
338-
// Gemma3-1B parameters.
339-
b->Args({128, 1152, 4, 256, 6 * 1152});
340369
}
341370

342371
BENCHMARK(FP32Attention)
@@ -415,4 +444,19 @@ BENCHMARK(FP16TransformerBlock)
415444
->UseRealTime()
416445
->Apply(TransformerBlockArguments);
417446

447+
BENCHMARK(QD8FullyConnected)
448+
->Unit(benchmark::kMicrosecond)
449+
->UseRealTime()
450+
->Apply(FaceMeshFullGemmArguments);
451+
452+
BENCHMARK(FP32FullyConnected)
453+
->Unit(benchmark::kMicrosecond)
454+
->UseRealTime()
455+
->Apply(FaceMeshFullGemmArguments);
456+
457+
BENCHMARK(FP16FullyConnected)
458+
->Unit(benchmark::kMicrosecond)
459+
->UseRealTime()
460+
->Apply(FaceMeshFullGemmArguments);
461+
418462
XNN_BENCHMARK_MAIN();

bench/subgraph/fully-connected.cc

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// This source code is licensed under the BSD-style license found in the
4+
// LICENSE file in the root directory of this source tree.
5+
6+
#include <algorithm>
7+
#include <array>
8+
#include <cstddef>
9+
#include <cstdint>
10+
#include <functional>
11+
#include <iostream>
12+
#include <limits>
13+
#include <random>
14+
#include <vector>
15+
16+
#include "bench/subgraph/models.h"
17+
#include "include/xnnpack.h"
18+
19+
// align a size up to XNN_EXTRA_BYTES
20+
#define XNN_PAD_EXTRA_BYTES(s, t) \
21+
(((s) + XNN_EXTRA_BYTES / sizeof(t) - 1) & ~(XNN_EXTRA_BYTES / sizeof(t) - 1))
22+
23+
namespace models {
24+
25+
// Builds a subgraph containing a single FP32 fully-connected node with
// randomly initialized weights and no bias:
//   output[batch_size, output_channels] =
//       input[batch_size, input_channels] * weights[output_channels,
//                                                   input_channels]^T
// The input (external id 0) and output (external id 1) are the subgraph's
// two external values. Returns nullptr on failure.
xnn_subgraph_t FP32FullyConnected(size_t batch_size, size_t input_channels,
                                  size_t output_channels) {
  xnn_status status;
  xnn_subgraph_t subgraph = nullptr;
  status = xnn_create_subgraph(/*num_external_values=*/2, 0, &subgraph);
  if (status != xnn_status_success) {
    // Fixed typo in the original message ("subgrpah").
    std::cerr << "failed to create subgraph" << std::endl;
    return nullptr;
  }

  std::random_device random_device;  // NOLINT(runtime/random_device)
  auto rng = std::mt19937(random_device());

  // External input tensor: [batch_size, input_channels].
  uint32_t v0 = XNN_INVALID_VALUE_ID;
  std::array<size_t, 2> v0_dims = {{batch_size, input_channels}};
  status = xnn_define_tensor_value(
      subgraph, xnn_datatype_fp32, v0_dims.size(), v0_dims.data(),
      /*data=*/nullptr, 0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0);
  if (status != xnn_status_success) {
    std::cerr << "failed to create tensor v0" << std::endl;
    // Don't leak the subgraph on error paths (the original returned without
    // deleting it).
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  // External output tensor: [batch_size, output_channels].
  uint32_t v38 = XNN_INVALID_VALUE_ID;
  std::array<size_t, 2> v38_dims = {{batch_size, output_channels}};
  status = xnn_define_tensor_value(
      subgraph, xnn_datatype_fp32, v38_dims.size(), v38_dims.data(),
      /*data=*/nullptr, 1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v38);
  if (status != xnn_status_success) {
    std::cerr << "failed to create tensor v38" << std::endl;
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  // Weight data is static so the buffer outlives the returned subgraph —
  // xnn_define_tensor_value stores the pointer, it does not copy the data.
  // NOTE(review): a later call with different sizes reallocates a buffer that
  // a still-live subgraph from an earlier call may reference; acceptable for
  // this benchmark, but confirm before reusing elsewhere.
  static std::vector<float> w42_data;
  w42_data.resize(XNN_PAD_EXTRA_BYTES(input_channels * output_channels, float));
  uint32_t w42 = XNN_INVALID_VALUE_ID;
  std::array<size_t, 2> w42_dims = {{output_channels, input_channels}};
  status = xnn_define_tensor_value(
      subgraph, xnn_datatype_fp32, w42_dims.size(), w42_dims.data(),
      /*data=*/w42_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w42);
  if (status != xnn_status_success) {
    std::cerr << "failed to create tensor w42" << std::endl;
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f),
                          std::ref(rng));
  std::generate(w42_data.begin(), w42_data.end(), std::ref(f32rng));

  // No activation clamping (infinite min/max) and no bias.
  status = xnn_define_fully_connected(
      subgraph,
      /*output_min=*/-std::numeric_limits<float>::infinity(),
      /*output_max=*/std::numeric_limits<float>::infinity(),
      /*input_id=*/v0,
      /*filter_id=*/w42,
      /*bias_id=*/XNN_INVALID_VALUE_ID,
      /*output_id=*/v38,
      /*flags=*/0);
  if (status != xnn_status_success) {
    std::cerr << "failed to create node #6" << std::endl;
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  return subgraph;
}  // NOLINT(readability/fn_size)
90+
91+
// Builds a subgraph computing a dynamically-quantized (QD8) fully-connected
// op: the FP32 input (external id 0) is converted to qdint8, then multiplied
// by channelwise-quantized int8 weights producing an FP32 output (external
// id 1). Weights and per-output-channel scales are randomly initialized; no
// bias. Returns nullptr on failure.
xnn_subgraph_t QD8FullyConnected(size_t batch_size, size_t input_channels,
                                 size_t output_channels) {
  xnn_status status;
  xnn_subgraph_t subgraph = nullptr;
  status = xnn_create_subgraph(/*num_external_values=*/2, 0, &subgraph);
  if (status != xnn_status_success) {
    // Fixed typo in the original message ("subgrpah").
    std::cerr << "failed to create subgraph" << std::endl;
    return nullptr;
  }

  std::random_device random_device;  // NOLINT(runtime/random_device)
  auto rng = std::mt19937(random_device());

  // External FP32 input tensor: [batch_size, input_channels].
  uint32_t v0 = XNN_INVALID_VALUE_ID;
  std::array<size_t, 2> v0_dims = {{batch_size, input_channels}};
  status = xnn_define_tensor_value(
      subgraph, xnn_datatype_fp32, v0_dims.size(), v0_dims.data(),
      /*data=*/nullptr, 0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &v0);
  if (status != xnn_status_success) {
    std::cerr << "failed to create tensor v0" << std::endl;
    // Don't leak the subgraph on error paths (the original returned without
    // deleting it).
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  // Internal dynamically-quantized tensor holding the converted input; only
  // the channel dimension is non-batch.
  uint32_t v1 = XNN_INVALID_VALUE_ID;
  std::array<size_t, 2> v1_dims = {{batch_size, input_channels}};
  status = xnn_define_dynamically_quantized_tensor_value(
      subgraph, xnn_datatype_qdint8, /*num_dims=*/v1_dims.size(),
      /*num_non_batch_dims=*/1, /*dims=*/v1_dims.data(),
      /*external_id=*/XNN_INVALID_VALUE_ID,
      /*flags=*/0, &v1);
  if (status != xnn_status_success) {
    std::cerr << "failed to create tensor v1" << std::endl;
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  // External FP32 output tensor: [batch_size, output_channels].
  uint32_t v38 = XNN_INVALID_VALUE_ID;
  std::array<size_t, 2> v38_dims = {{batch_size, output_channels}};
  status = xnn_define_tensor_value(
      subgraph, xnn_datatype_fp32, v38_dims.size(), v38_dims.data(),
      /*data=*/nullptr, 1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &v38);
  if (status != xnn_status_success) {
    std::cerr << "failed to create tensor v38" << std::endl;
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  // Weight/scale data is static so the buffers outlive the returned subgraph —
  // xnn_define_*_tensor_value stores the pointers, it does not copy the data.
  // NOTE(review): a later call with different sizes reallocates buffers that a
  // still-live subgraph from an earlier call may reference; acceptable for
  // this benchmark, but confirm before reusing elsewhere.
  static std::vector<int8_t> w42_data;
  w42_data.resize(
      XNN_PAD_EXTRA_BYTES(input_channels * output_channels, int8_t));
  uint32_t w42 = XNN_INVALID_VALUE_ID;
  std::array<size_t, 2> w42_dims = {{output_channels, input_channels}};
  // One positive scale per output channel (channel dimension 0 of the
  // [output_channels, input_channels] weights).
  static std::vector<float> w42_scale;
  w42_scale.resize(output_channels);
  {
    auto scalerng = std::bind(
        std::uniform_real_distribution<float>(0.01f, 1.0f), std::ref(rng));
    std::generate(w42_scale.begin(), w42_scale.end(), std::ref(scalerng));
  }
  status = xnn_define_channelwise_quantized_tensor_value(
      subgraph, xnn_datatype_qcint8,
      /*scale=*/w42_scale.data(), w42_dims.size(), 0, w42_dims.data(),
      /*data=*/w42_data.data(), XNN_INVALID_VALUE_ID, /*flags=*/0, &w42);
  if (status != xnn_status_success) {
    std::cerr << "failed to create tensor w42" << std::endl;
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  auto qc8rng = std::bind(
      std::uniform_int_distribution<int>(std::numeric_limits<int8_t>::min(),
                                         std::numeric_limits<int8_t>::max()),
      std::ref(rng));
  std::generate(w42_data.begin(), w42_data.end(), std::ref(qc8rng));

  // Dynamic quantization of the input: FP32 -> qdint8.
  status = xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr,
                            /*input_id=*/v0, /*output_id=*/v1,
                            /*flags=*/0);
  if (status != xnn_status_success) {
    // Fixed doubled word in the original message ("create create").
    std::cerr << "failed to create convert node" << std::endl;
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  // No activation clamping (infinite min/max) and no bias.
  status = xnn_define_fully_connected(
      subgraph,
      /*output_min=*/-std::numeric_limits<float>::infinity(),
      /*output_max=*/std::numeric_limits<float>::infinity(),
      /*input_id=*/v1,
      /*filter_id=*/w42,
      /*bias_id=*/XNN_INVALID_VALUE_ID,
      /*output_id=*/v38,
      /*flags=*/0);
  if (status != xnn_status_success) {
    std::cerr << "failed to create node #6" << std::endl;
    xnn_delete_subgraph(subgraph);
    return nullptr;
  }

  return subgraph;
}  // NOLINT(readability/fn_size)
187+
188+
} // namespace models

bench/subgraph/models.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ xnn_subgraph_t FP32TransformerBlock(size_t batch_size, size_t sequence_length,
8282
size_t embedding_dim, size_t num_heads,
8383
size_t head_dim, size_t hidden_dim);
8484

85+
xnn_subgraph_t FP32FullyConnected(size_t batch_size, size_t input_channels,
86+
size_t output_channels);
87+
xnn_subgraph_t QD8FullyConnected(size_t batch_size, size_t input_channels,
88+
size_t output_channels);
8589
} // namespace models
8690

8791
#endif // THIRD_PARTY_XNNPACK_BENCH_MODELS_MODELS_H_

src/runtime.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "src/xnnpack/operator.h"
4040
#include "src/xnnpack/params.h"
4141
#include "src/xnnpack/subgraph.h"
42+
#include "src/subgraph/subgraph-utils.h"
4243
#include <pthreadpool.h>
4344

4445
enum xnn_status xnn_reshape_external_value(
@@ -543,6 +544,8 @@ enum xnn_status xnn_create_runtime_v4(
543544
xnn_log_error("failed to optimize subgraph");
544545
goto error;
545546
}
547+
548+
xnn_subgraph_log_info(subgraph);
546549

547550
status = xnn_status_out_of_memory;
548551

0 commit comments

Comments
 (0)