[ONNX OP] DynamicQuantizeMatMul #28158

Draft · wants to merge 8 commits into master
@@ -0,0 +1,109 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "core/operator_set.hpp"
#include "exceptions.hpp"
#include "openvino/frontend/exception.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/convert.hpp"
#include "utils/common.hpp"

using namespace ov::op;

namespace ov {
namespace frontend {
namespace onnx {
namespace com_microsoft {
namespace opset_1 {
ov::OutputVector dynamic_quantize_matmul(const ov::frontend::onnx::Node& node) {
    // Original documentation:
    // https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.DynamicQuantizeMatMul

    // A, B and b_scale are required inputs; b_zero_point and bias are optional inputs
    common::default_op_checks(node, 3);

    const auto inputs = node.get_ov_inputs();
    const auto& A = inputs[0];        // required
    const auto& B = inputs[1];        // required
    const auto& b_scale = inputs[2];  // required

    ov::Output<ov::Node> b_zero_point;  // optional, inputs[3]
    ov::Output<ov::Node> bias;          // optional, inputs[4]

    // Constrain input matrix A to T1 type (float tensor)
    auto element_type_A = A.get_element_type();
    CHECK_VALID_NODE(node,
                     element_type_A == ov::element::f32,
                     "Unsupported input A type, accepted FP32 but got: ",
                     element_type_A);

    // Constrain input matrix B to T2 type (int8 tensor, uint8 tensor)
    auto element_type_B = B.get_element_type();
    CHECK_VALID_NODE(node,
                     element_type_B == ov::element::u8 || element_type_B == ov::element::i8,
                     "Unsupported input B type, accepted UINT8, INT8 but got: ",
                     element_type_B);

    // Constrain input b_scale to T1 type (float tensor)
    auto element_type_b_scale = b_scale.get_element_type();
    CHECK_VALID_NODE(node,
                     element_type_b_scale == ov::element::f32,
                     "Unsupported input b_scale type, accepted FP32 but got: ",
                     element_type_b_scale);

    // Check for the optional inputs
    if (inputs.size() > 3) {
        // Constrain input b_zero_point to T2 type (int8 tensor, uint8 tensor)
        b_zero_point = inputs[3];
        auto element_type_b_zero_point = b_zero_point.get_element_type();
        CHECK_VALID_NODE(node,
                         element_type_b_zero_point == ov::element::u8 || element_type_b_zero_point == ov::element::i8,
                         "Unsupported input b_zero_point type, accepted UINT8, INT8 but got: ",
                         element_type_b_zero_point);
    }

    if (inputs.size() > 4) {
        // Constrain input bias to T1 type (float tensor)
        bias = inputs[4];
        auto element_type_bias = bias.get_element_type();
        CHECK_VALID_NODE(node,
                         element_type_bias == ov::element::f32,
                         "Unsupported input bias type, accepted FP32 but got: ",
                         element_type_bias);
    }

    // At the time of writing, ov::MatMul does not support int8/uint8 inputs, so B is dequantized up front.
    // Strictly speaking this skips the dynamic quantization of A, but it produces the operator's correct
    // output by computing A * B_dequantized + bias.
    // ONNX Runtime uses linear quantization, described here:
    // https://tomwildenhain-microsoft.github.io/onnxruntime/docs/performance/quantization.html
    // B_dequantized = (B - b_zero_point) * b_scale

    ov::Output<ov::Node> B_dequantized = std::make_shared<v0::Convert>(B, b_scale.get_element_type());
    if (b_zero_point.get_node_shared_ptr()) {
        // b_zero_point was provided; if it is absent, the zero point defaults to 0 and the Subtract is skipped
        b_zero_point = std::make_shared<v0::Convert>(b_zero_point, b_scale.get_element_type());
        B_dequantized = std::make_shared<v1::Subtract>(B_dequantized, b_zero_point);
    }
    B_dequantized = std::make_shared<v1::Multiply>(B_dequantized, b_scale);

    // A and B are N-dimensional matrices. In the example ONNX models for this operator, B is already stored
    // in its transposed layout, e.g.
    // https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.onnx
    // so ov::MatMul is created without any transpose flags.
    auto result = std::make_shared<v0::MatMul>(A, B_dequantized, false, false);

    // Add bias if provided
    if (bias.get_node_shared_ptr()) {
        return {std::make_shared<v1::Add>(result, bias)};
    }

    return {result};
}

ONNX_OP("DynamicQuantizeMatMul", OPSET_SINCE(1), com_microsoft::opset_1::dynamic_quantize_matmul, MICROSOFT_DOMAIN);

} // namespace opset_1
} // namespace com_microsoft
} // namespace onnx
} // namespace frontend
} // namespace ov
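For reference, the conversion above is equivalent to building the following small OpenVINO graph by hand. This is a minimal illustrative sketch only, not part of the PR: it assumes all five inputs are supplied, uses static shapes matching the unit test further below, and the helper name make_dynamic_quantize_matmul_sketch as well as the bias shape are made up for the example.

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/subtract.hpp"

// Builds the decomposition A * ((Convert(B) - Convert(b_zero_point)) * b_scale) + bias by hand.
std::shared_ptr<ov::Model> make_dynamic_quantize_matmul_sketch() {
    using namespace ov;
    using namespace ov::op;

    // Illustrative static shapes matching the unit test below: A is [3, 2], B is [2, 3]
    auto A = std::make_shared<v0::Parameter>(element::f32, Shape{3, 2});
    auto B = std::make_shared<v0::Parameter>(element::i8, Shape{2, 3});
    auto b_scale = std::make_shared<v0::Parameter>(element::f32, Shape{1});
    auto b_zero_point = std::make_shared<v0::Parameter>(element::i8, Shape{1});
    auto bias = std::make_shared<v0::Parameter>(element::f32, Shape{3});  // assumed broadcastable bias shape

    // B_dequantized = (B - b_zero_point) * b_scale, computed in f32
    Output<Node> B_deq = std::make_shared<v0::Convert>(B, element::f32);
    Output<Node> zp = std::make_shared<v0::Convert>(b_zero_point, element::f32);
    B_deq = std::make_shared<v1::Subtract>(B_deq, zp);
    B_deq = std::make_shared<v1::Multiply>(B_deq, b_scale);

    // Y = A x B_dequantized + bias (no transposes, B is already in the expected layout)
    auto matmul = std::make_shared<v0::MatMul>(A, B_deq, false, false);
    auto Y = std::make_shared<v1::Add>(matmul, bias);

    return std::make_shared<Model>(OutputVector{Y}, ParameterVector{A, B, b_scale, b_zero_point, bias});
}

When b_zero_point or bias is absent, the corresponding Subtract or Add is simply omitted, as in the converter above.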
@@ -0,0 +1,91 @@
ir_version: 3
producer_name: "OpenVINO ONNX Frontend"
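# elem_type values follow onnx.TensorProto.DataType: 1 = FLOAT, 3 = INT8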
graph {
  node {
    input: "A"
    input: "B"
    input: "b_scale"
    input: "b_zero_point"
    output: "Y"
    op_type: "DynamicQuantizeMatMul"
    domain: "com.microsoft"
  }
  name: "test_dqmm_example"
  input {
    name: "A"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 3
          }
          dim {
            dim_value: 2
          }
        }
      }
    }
  }
  input {
    name: "B"
    type {
      tensor_type {
        elem_type: 3
        shape {
          dim {
            dim_value: 2
          }
          dim {
            dim_value: 3
          }
        }
      }
    }
  }
  input {
    name: "b_scale"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
  input {
    name: "b_zero_point"
    type {
      tensor_type {
        elem_type: 3
        shape {
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
  output {
    name: "Y"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 3
          }
          dim {
            dim_value: 3
          }
        }
      }
    }
  }
}
opset_import {
  version: 1
}
27 changes: 27 additions & 0 deletions src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
@@ -1357,6 +1357,33 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_quickgelu) {
}
}

OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_dynamic_quantize_matmul) {
    const auto model = convert_model("com.microsoft/dynamic_quantize_matmul.onnx");
    auto test_case = ov::test::TestCase(model, s_device);

    // A is a 3x2 float matrix, B is a 2x3 int8 matrix with a one-element scale and zero point
    const std::vector<float> input_A{1.29292f, 2.47473f, 3.291903f, 4.1728945f, 5.213912f, 6.1293125f};
    const std::vector<int8_t> input_B{-2, 29, 61, 61, 29, 125};
    const std::vector<float> b_scale{0.003137f};
    const std::vector<int8_t> b_zero_point{-34};

    const std::vector<float> expected{0.8681802f, 0.7458673f, 1.6218146f, 1.5770973f, 1.4774824f, 3.0677009f, 2.3504133f, 2.2423527f, 4.611995f};
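
    // Illustrative note (not how the reference was generated): the expected values can be approximated
    // by dequantizing B and doing a plain float matmul:
    //   B_deq[k][j] = (input_B[k * 3 + j] - b_zero_point[0]) * b_scale[0]
    //   Y[i][j]     = sum over k of input_A[i * 2 + k] * B_deq[k][j]
    // e.g. Y[0][0] = 1.29292 * (-2 + 34) * 0.003137 + 2.47473 * (61 + 34) * 0.003137 ≈ 0.8673, which
    // matches expected[0] = 0.8681802 within the 0.0055 tolerance used below (the small gap presumably
    // comes from the reference being computed with A dynamically quantized as well).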

    // add_input must be called in the order of the model inputs (order matters)
    test_case.add_input<float>(Shape{3, 2}, input_A);
    test_case.add_input<int8_t>(Shape{2, 3}, input_B);
    test_case.add_input<float>(Shape{1}, b_scale);
    test_case.add_input<int8_t>(Shape{1}, b_zero_point);

    test_case.add_expected_output<float>(Shape{3, 3}, expected);

    test_case.run_with_tolerance_as_fp(0.0055f);
}

OPENVINO_TEST(${BACKEND_NAME}, onnx_model_skip_simplified_layer_normalization) {
const auto model = convert_model("com.microsoft/skip_simplified_layer_normalization.onnx");
auto test_case = ov::test::TestCase(model, s_device);