Add XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC #8077

Merged · 1 commit · Jun 3, 2025
1 change: 1 addition & 0 deletions BUILD.bazel
@@ -963,6 +963,7 @@ xnnpack_cc_library(
":logging",
":params",
"//src/configs:hardware_config",
"@KleidiAI//:common",
"@pthreadpool",
],
)
5 changes: 5 additions & 0 deletions include/xnnpack.h
@@ -105,6 +105,11 @@ extern "C" {
/// may fail with an error.
#define XNN_FLAG_NO_BROADCAST 0x00001000

/// This flag indicates that XNNPACK should attempt to produce numerically consistent results from a specific
/// build of XNNPACK. This causes XNNPACK to avoid using faster codepaths that are numerically inconsistent
/// with any other codepath that could be used in the same compiled XNNPACK library.
#define XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC 0x00002000

/// The number of entries in an array of xnn_quantization_params that XNNPACK may read beyond array bounds.
/// The caller must allocate at least this many extra xnn_quantization_params before passing the array to XNNPACK.
///
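For context, here is a minimal sketch of how a caller might request the new behavior when building a subgraph. This is not part of the diff: the helper name `define_consistent_bmm` is hypothetical, and the node definition call is assumed to be the existing `xnn_define_batch_matrix_multiply` entry point; node flags are combined as a plain bitmask.

```c++
#include <xnnpack.h>

// Hypothetical helper (not part of this PR): define a batch matrix multiply
// node that trades speed for numerically consistent results within one build
// of XNNPACK.
xnn_status define_consistent_bmm(xnn_subgraph_t subgraph, uint32_t input_a_id,
                                 uint32_t input_b_id, uint32_t output_id,
                                 bool transpose_b) {
  // Request consistent arithmetic; OR in other node flags as needed.
  uint32_t flags = XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC;
  if (transpose_b) {
    flags |= XNN_FLAG_TRANSPOSE_B;
  }
  return xnn_define_batch_matrix_multiply(subgraph, input_a_id, input_b_id,
                                          output_id, flags);
}
```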
14 changes: 11 additions & 3 deletions test/subgraph/batch-matrix-multiply.cc
@@ -187,6 +187,8 @@ void FakeDynamicQuantize(const Tensor<quantized<Data>>& input, xnn_datatype) {}
template <typename Input, typename Output = Input>
void TestDynamicB(xnn_datatype convert_to = xnn_datatype_invalid) {
ReplicableRandomDevice rng;
std::bernoulli_distribution flag_dist(0.5);
std::uniform_int_distribution<> rank_dist{2, XNN_MAX_TENSOR_DIMS - 1};

ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));

@@ -196,15 +198,17 @@ void TestDynamicB(xnn_datatype convert_to = xnn_datatype_invalid) {
broadcast_extent_1(b_scales);

for (auto _ : FuzzTest(std::chrono::milliseconds(1000))) {
std::uniform_int_distribution<> rank_dist{2, XNN_MAX_TENSOR_DIMS - 1};
size_t input_a_rank = rank_dist(rng);
size_t input_b_rank = rank_dist(rng);
size_t output_rank = std::max(input_a_rank, input_b_rank);

uint32_t flags = 0;
if (rng() & 1) {
if (flag_dist(rng)) {
flags |= XNN_FLAG_TRANSPOSE_B;
}
if (flag_dist(rng)) {
flags |= XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC;
}

SubgraphTester subgraph(3);
const uint32_t input_a_id = 0;
@@ -294,17 +298,21 @@ TEST(BatchMatrixMultiplyBF16F32, dynamic_b) {
template <typename InputA, typename InputB, typename Output = InputA>
void TestStaticB(xnn_datatype convert_to = xnn_datatype_invalid) {
ReplicableRandomDevice rng;
std::bernoulli_distribution flag_dist(0.5);
std::uniform_int_distribution<> dim_dist{1, 100};
std::uniform_int_distribution<> rank_dist{2, XNN_MAX_TENSOR_DIMS - 1};

ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));

for (auto _ : FuzzTest(std::chrono::milliseconds(1000))) {
std::uniform_int_distribution<> rank_dist{2, XNN_MAX_TENSOR_DIMS - 1};
size_t input_a_rank = rank_dist(rng);
size_t input_b_rank = rank_dist(rng);
size_t output_rank = std::max(input_a_rank, input_b_rank);

uint32_t flags = 0;
if (flag_dist(rng)) {
flags |= XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC;
}

SubgraphTester subgraph(3);
const uint32_t input_a_id = 0;
12 changes: 10 additions & 2 deletions test/subgraph/fully-connected.cc
@@ -338,6 +338,7 @@ void TestStaticB(xnn_datatype convert_to = xnn_datatype_invalid,
divide_round_up(8, xnn_datatype_size_bits(datatype_of<Filter>()));

ReplicableRandomDevice rng;
std::bernoulli_distribution flag_dist(0.5);

ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));

@@ -364,9 +365,12 @@
uint32_t flags = 0;
if (filter_channel_factor > 1) {
// Sub-byte datatypes don't support transposed weights
} else if (rng() & 1) {
} else if (flag_dist(rng)) {
flags |= XNN_FLAG_TRANSPOSE_WEIGHTS;
}
if (flag_dist(rng)) {
flags |= XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC;
}

// Make a random filter.
std::vector<size_t> filter_shape = {output_channels,
@@ -596,6 +600,7 @@ template <typename Input, typename Filter, typename Bias,
void TestDynamicB(xnn_datatype convert_to = xnn_datatype_invalid,
size_t block_size = no_blockwise) {
ReplicableRandomDevice rng;
std::bernoulli_distribution flag_dist(0.5);

ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));

@@ -617,9 +622,12 @@
const size_t rank = rank_dist(rng);

uint32_t flags = 0;
if (rng() & 1) {
if (flag_dist(rng)) {
flags |= XNN_FLAG_TRANSPOSE_WEIGHTS;
}
if (flag_dist(rng)) {
flags |= XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC;
}

float output_min = output_gen(rng);
float output_max = output_gen(rng);
8 changes: 4 additions & 4 deletions test/subgraph/subgraph-tester.cc
@@ -411,10 +411,10 @@ SubgraphTester& SubgraphTester::AddBinary(xnn_binary_operator op,

SubgraphTester& SubgraphTester::AddUnary(xnn_unary_operator op,
xnn_unary_params* params,
uint32_t input_id,
uint32_t output_id) {
const xnn_status status = xnn_define_unary(subgraph_.get(), op, params,
input_id, output_id, /*flags=*/0);
uint32_t input_id, uint32_t output_id,
uint32_t flags) {
const xnn_status status =
xnn_define_unary(subgraph_.get(), op, params, input_id, output_id, flags);
EXPECT_EQ(status, xnn_status_success);
return *this;
}
3 changes: 2 additions & 1 deletion test/subgraph/subgraph-tester.h
@@ -391,7 +391,8 @@ class SubgraphTester {
uint32_t output_id);

SubgraphTester& AddUnary(xnn_unary_operator op, xnn_unary_params* params,
uint32_t input_id, uint32_t output_id);
uint32_t input_id, uint32_t output_id,
uint32_t flags = 0);

SubgraphTester& AddConvolution2D(ConvolutionParams params, uint32_t input_id,
uint32_t filter_id, uint32_t bias_id,
6 changes: 5 additions & 1 deletion test/subgraph/unary.cc
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <random>
#include <sstream>
#include <string>
#include <tuple>
@@ -70,6 +71,7 @@ void TestImpl(size_t rank, xnn_unary_operator op) {
const xnn_datatype datatype_out = xnn_datatype_of<Out>();

ReplicableRandomDevice rng;
std::bernoulli_distribution flag_dist(0.5);

// We want the total number of elements to be reasonable, so choose max_dim
// such that a random shape of rank `rank` produces this max size.
@@ -85,14 +87,16 @@
op_info->InputQuantizationParams(datatype_in);
xnn_quantization_params output_quantization =
op_info->OutputQuantizationParams(datatype_out);
const uint32_t flags =
flag_dist(rng) ? XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC : 0;

Interval domain = op_info->Domain(datatype_in);
xnn_unary_params params = op_info->DefaultParams();

SubgraphTester subgraph(3);
subgraph.AddInputTensor(rank, datatype_in, input_quantization, 0)
.AddOutputTensor(rank, datatype_out, output_quantization, 1)
.AddUnary(op, &params, 0, 1);
.AddUnary(op, &params, 0, 1, flags);
xnn_status status = subgraph.CreateRuntime();
ASSERT_EQ(status, xnn_status_success);
