[ONNX OP] DynamicQuantizeMatMul #28158

Draft · wants to merge 8 commits into master
@@ -0,0 +1,109 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "core/operator_set.hpp"
#include "exceptions.hpp"
#include "openvino/frontend/exception.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/convert.hpp"
#include "utils/common.hpp"

using namespace ov::op;

namespace ov {
namespace frontend {
namespace onnx {
namespace com_microsoft {
namespace opset_1 {
ov::OutputVector dynamic_quantize_matmul(const ov::frontend::onnx::Node& node) {
    // Original documentation:
    // https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.DynamicQuantizeMatMul

    // A, B and b_scale are required inputs; b_zero_point and bias are optional inputs
    common::default_op_checks(node, 3);

    const auto inputs = node.get_ov_inputs();
    const auto& A = inputs[0];        // required
    const auto& B = inputs[1];        // required
    const auto& b_scale = inputs[2];  // required

    ov::Output<ov::Node> b_zero_point;  // optional, inputs[3]
    ov::Output<ov::Node> bias;          // optional, inputs[4]

    // Constrain input matrix A to T1 type (float tensor)
    auto element_type_A = A.get_element_type();
    CHECK_VALID_NODE(node,
                     element_type_A == ov::element::f32,
                     "Unsupported input A type, accepted FP32 but got: ",
                     element_type_A);

    // Constrain input matrix B to T2 type (int8 tensor, uint8 tensor)
    auto element_type_B = B.get_element_type();
    CHECK_VALID_NODE(node,
                     element_type_B == ov::element::u8 || element_type_B == ov::element::i8,
                     "Unsupported input B type, accepted UINT8, INT8 but got: ",
                     element_type_B);

    // Constrain input b_scale to T1 type (float tensor)
    auto element_type_b_scale = b_scale.get_element_type();
    CHECK_VALID_NODE(node,
                     element_type_b_scale == ov::element::f32,
                     "Unsupported input b_scale type, accepted FP32 but got: ",
                     element_type_b_scale);

    // Check for the optional inputs
    if (inputs.size() > 3) {
        // Constrain input b_zero_point to T2 type (int8 tensor, uint8 tensor)
        b_zero_point = inputs[3];
        auto element_type_b_zero_point = b_zero_point.get_element_type();
        CHECK_VALID_NODE(node,
                         element_type_b_zero_point == ov::element::u8 || element_type_b_zero_point == ov::element::i8,
                         "Unsupported input b_zero_point type, accepted UINT8, INT8 but got: ",
                         element_type_b_zero_point);
    }

    if (inputs.size() > 4) {
        // Constrain input bias to T1 type (float tensor)
        bias = inputs[4];
        auto element_type_bias = bias.get_element_type();
        CHECK_VALID_NODE(node,
                         element_type_bias == ov::element::f32,
                         "Unsupported input bias type, accepted FP32 but got: ",
                         element_type_bias);
    }

    // At the time of writing, ov::MatMul does not support int8/uint8 inputs, so B is dequantized up front.
    // Strictly speaking this skips the dynamic quantization of A, but it produces the operator's correct
    // output by computing A * B_dequantized + bias.
    // ONNX Runtime uses linear quantization, described here:
    // https://tomwildenhain-microsoft.github.io/onnxruntime/docs/performance/quantization.html
    // B_dequantized = (B - b_zero_point) * b_scale

    ov::Output<ov::Node> B_dequantized = std::make_shared<v0::Convert>(B, b_scale.get_element_type());
    if (b_zero_point.get_node_shared_ptr()) {
        // b_zero_point was provided; if it is absent, the zero point defaults to 0 and the Subtract is skipped
        b_zero_point = std::make_shared<v0::Convert>(b_zero_point, b_scale.get_element_type());
        B_dequantized = std::make_shared<v1::Subtract>(B_dequantized, b_zero_point);
    }
    B_dequantized = std::make_shared<v1::Multiply>(B_dequantized, b_scale);

    // A and B are N-dimensional matrices. In the example ONNX models for this operator, B is already stored
    // in its transposed layout, e.g.
    // https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.onnx
    // so ov::MatMul is created without any transpose flags.
    auto result = std::make_shared<v0::MatMul>(A, B_dequantized, false, false);

    // Add bias if provided
    if (bias.get_node_shared_ptr()) {
        return {std::make_shared<v1::Add>(result, bias)};
    }

    return {result};
}

ONNX_OP("DynamicQuantizeMatMul", OPSET_SINCE(1), com_microsoft::opset_1::dynamic_quantize_matmul, MICROSOFT_DOMAIN);

} // namespace opset_1
} // namespace com_microsoft
} // namespace onnx
} // namespace frontend
} // namespace ov
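For reference, the conversion above is equivalent to building the following small OpenVINO graph by hand. This is a minimal illustrative sketch only, not part of the PR: it assumes all five inputs are supplied, uses static shapes matching the unit test further below, and the helper name make_dynamic_quantize_matmul_sketch as well as the bias shape are made up for the example.

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/subtract.hpp"

// Builds the decomposition A * ((Convert(B) - Convert(b_zero_point)) * b_scale) + bias by hand.
std::shared_ptr<ov::Model> make_dynamic_quantize_matmul_sketch() {
    using namespace ov;
    using namespace ov::op;

    // Illustrative static shapes matching the unit test below: A is [3, 2], B is [2, 3]
    auto A = std::make_shared<v0::Parameter>(element::f32, Shape{3, 2});
    auto B = std::make_shared<v0::Parameter>(element::i8, Shape{2, 3});
    auto b_scale = std::make_shared<v0::Parameter>(element::f32, Shape{1});
    auto b_zero_point = std::make_shared<v0::Parameter>(element::i8, Shape{1});
    auto bias = std::make_shared<v0::Parameter>(element::f32, Shape{3});  // assumed broadcastable bias shape

    // B_dequantized = (B - b_zero_point) * b_scale, computed in f32
    Output<Node> B_deq = std::make_shared<v0::Convert>(B, element::f32);
    Output<Node> zp = std::make_shared<v0::Convert>(b_zero_point, element::f32);
    B_deq = std::make_shared<v1::Subtract>(B_deq, zp);
    B_deq = std::make_shared<v1::Multiply>(B_deq, b_scale);

    // Y = A x B_dequantized + bias (no transposes, B is already in the expected layout)
    auto matmul = std::make_shared<v0::MatMul>(A, B_deq, false, false);
    auto Y = std::make_shared<v1::Add>(matmul, bias);

    return std::make_shared<Model>(OutputVector{Y}, ParameterVector{A, B, b_scale, b_zero_point, bias});
}

When b_zero_point or bias is absent, the corresponding Subtract or Add is simply omitted, as in the converter above.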
@@ -0,0 +1,91 @@
ir_version: 3
producer_name: "OpenVINO ONNX Frontend"
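# elem_type values follow onnx.TensorProto.DataType: 1 = FLOAT, 3 = INT8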
graph {
  node {
    input: "A"
    input: "B"
    input: "b_scale"
    input: "b_zero_point"
    output: "Y"
    op_type: "DynamicQuantizeMatMul"
    domain: "com.microsoft"
  }
  name: "test_dqmm_example"
  input {
    name: "A"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 3
          }
          dim {
            dim_value: 2
          }
        }
      }
    }
  }
  input {
    name: "B"
    type {
      tensor_type {
        elem_type: 3
        shape {
          dim {
            dim_value: 2
          }
          dim {
            dim_value: 3
          }
        }
      }
    }
  }
  input {
    name: "b_scale"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
  input {
    name: "b_zero_point"
    type {
      tensor_type {
        elem_type: 3
        shape {
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
  output {
    name: "Y"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 3
          }
          dim {
            dim_value: 3
          }
        }
      }
    }
  }
}
opset_import {
  version: 1
}
27 changes: 27 additions & 0 deletions src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
@@ -1357,6 +1357,33 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_quickgelu) {
}
}

OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_dynamic_quantize_matmul) {
    const auto model = convert_model("com.microsoft/dynamic_quantize_matmul.onnx");
    auto test_case = ov::test::TestCase(model, s_device);

    // A is a 3x2 float matrix, B is a 2x3 int8 matrix with a one-element scale and zero point
    const std::vector<float> input_A{1.29292f, 2.47473f, 3.291903f, 4.1728945f, 5.213912f, 6.1293125f};
    const std::vector<int8_t> input_B{-2, 29, 61, 61, 29, 125};
    const std::vector<float> b_scale{0.003137f};
    const std::vector<int8_t> b_zero_point{-34};

    const std::vector<float> expected{0.8681802f, 0.7458673f, 1.6218146f, 1.5770973f, 1.4774824f, 3.0677009f, 2.3504133f, 2.2423527f, 4.611995f};
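
    // Illustrative note (not how the reference was generated): the expected values can be approximated
    // by dequantizing B and doing a plain float matmul:
    //   B_deq[k][j] = (input_B[k * 3 + j] - b_zero_point[0]) * b_scale[0]
    //   Y[i][j]     = sum over k of input_A[i * 2 + k] * B_deq[k][j]
    // e.g. Y[0][0] = 1.29292 * (-2 + 34) * 0.003137 + 2.47473 * (61 + 34) * 0.003137 ≈ 0.8673, which
    // matches expected[0] = 0.8681802 within the 0.0055 tolerance used below (the small gap presumably
    // comes from the reference being computed with A dynamically quantized as well).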

    // add_input must be called in the order of the model inputs (order matters)
    test_case.add_input<float>(Shape{3, 2}, input_A);
    test_case.add_input<int8_t>(Shape{2, 3}, input_B);
    test_case.add_input<float>(Shape{1}, b_scale);
    test_case.add_input<int8_t>(Shape{1}, b_zero_point);

    test_case.add_expected_output<float>(Shape{3, 3}, expected);

    test_case.run_with_tolerance_as_fp(0.0055f);
}

OPENVINO_TEST(${BACKEND_NAME}, onnx_model_skip_simplified_layer_normalization) {
const auto model = convert_model("com.microsoft/skip_simplified_layer_normalization.onnx");
auto test_case = ov::test::TestCase(model, s_device);