Changes from all commits
88 commits
aa10cbe
add perf
luo-cheng2021 Feb 11, 2025
3a80081
perf for sdpa/pa
luo-cheng2021 Feb 13, 2025
d209f03
add git ignore
luo-cheng2021 Mar 26, 2025
45bf16e
insert if for moe expert
luo-cheng2021 Mar 31, 2025
795e323
add moeexpert support
luo-cheng2021 Apr 2, 2025
d7f2602
fix moexpert precision is always f32
luo-cheng2021 Apr 2, 2025
7f4b901
add moeexpert support for gpu
luo-cheng2021 Apr 2, 2025
76a7d5b
opt: 1, simplify subgraph inside moeexpert; 2, only compute skip flag…
luo-cheng2021 Apr 4, 2025
6079269
opt: remove nonzero->split from subgraph into moeexpert for gpu
luo-cheng2021 Apr 5, 2025
c385c8f
Support Qwen3 rms kernel for input with dynamic padding
riverlijunjie Mar 31, 2025
ada754a
Add test case
riverlijunjie Mar 31, 2025
38ded44
WA: moe_expert wait all inputs ready
luo-cheng2021 Apr 7, 2025
f84303e
fix incorrect output shape computation
luo-cheng2021 Apr 8, 2025
df0ca20
add fast path for expert mask computation if no padding
luo-cheng2021 Apr 8, 2025
00e7d9a
qwen3 moe compile model opt, from 150s to 70s in LNL (#66)
riverlijunjie Apr 9, 2025
019262b
Move FuseMoeExpert2 ahead of CommonOptimizations to decrease compilin…
ceciliapeng2011 Apr 9, 2025
994c094
not use subgraph for moeexpert
luo-cheng2021 Apr 10, 2025
22c93ee
fix scale/zp layout; first expert should not be inplace
luo-cheng2021 Apr 11, 2025
7c872a0
merge all experts into one op
luo-cheng2021 Apr 12, 2025
75b9683
Optimize gather and index_add performance
riverlijunjie Apr 13, 2025
2d8eb4e
fix out_of_resource error on lunarlake
luo-cheng2021 Apr 14, 2025
06e436c
Move weights from usm_host to usm_device memory
riverlijunjie Apr 16, 2025
02f2331
Add ITT for MoE
riverlijunjie Apr 17, 2025
b6b5f1d
Optimize BMG first token due to index_add kernel
riverlijunjie Apr 17, 2025
9383141
opt: merge all experts into one for batch1
luo-cheng2021 Apr 18, 2025
c7ef4ea
opt: cl code for mlp_*
luo-cheng2021 Apr 18, 2025
76e6ed7
change weight back to ba
luo-cheng2021 Apr 19, 2025
8471f6b
small tune for lunarlake
luo-cheng2021 Apr 21, 2025
818ba1b
fuse onehot into moe
luo-cheng2021 Apr 21, 2025
eed40eb
not wait gpu for batch1
luo-cheng2021 Apr 21, 2025
bd8e5f6
optimize mlp 2nd token bandwidth
usstq Apr 22, 2025
b7278d9
minor fix
luo-cheng2021 Apr 22, 2025
caa1f6e
Optimize moe_reduce for BMG
riverlijunjie Apr 24, 2025
e2812a4
add cm support
luo-cheng2021 Apr 23, 2025
0d2e996
moe expert cm kernel
luo-cheng2021 Apr 24, 2025
e08f8af
moe cm group 128 ok
luo-cheng2021 Apr 25, 2025
45618fb
cm moe zp ok(env: CM_MASK=3)
luo-cheng2021 Apr 25, 2025
27cbccf
default enable moe_up, disable moe_down
luo-cheng2021 Apr 25, 2025
100ca20
minor: reduce the parameter number of cm moe_up
luo-cheng2021 Apr 25, 2025
a8bcc42
Add perf
riverlijunjie Apr 24, 2025
e79494d
use i32 for paged_attention
luo-cheng2021 Apr 27, 2025
c409cfb
fuse softmax-topk
peterchen-intel Apr 27, 2025
7a3d3e4
fix bugs in softmax_topk fusion (add env NO_SOFTTOPK)
peterchen-intel Apr 28, 2025
3bca3f3
not alloc mem for cm if CM_MASK==0
luo-cheng2021 Apr 28, 2025
7930a66
Disable perf by default
riverlijunjie Apr 29, 2025
8e67325
Remove some logs
riverlijunjie May 6, 2025
378d56e
Remove cpu moe code
riverlijunjie May 6, 2025
548f1c2
Merge branch 'master' into gpu/qwen3_moe_cm
riverlijunjie May 6, 2025
7c06a5d
revert use i32 for pa
riverlijunjie May 7, 2025
c252879
Cleanup unused code
riverlijunjie May 7, 2025
3aa1d73
Move test case to common
riverlijunjie May 7, 2025
0722290
refine moe_expert_opt
riverlijunjie May 7, 2025
4b10730
simplify transform part code(TODO: accuracy)
luo-cheng2021 May 7, 2025
bf32356
fix gpu unit test(use cpu as reference)
luo-cheng2021 May 8, 2025
21efc98
Solve usm indirect memory access
riverlijunjie May 8, 2025
c7f5dab
more checks for pattern match
luo-cheng2021 May 9, 2025
a9c5a4f
use framework's intermediate buffer mechanism
luo-cheng2021 May 9, 2025
6a2dd26
Use direct memory access replace indirect access mode
riverlijunjie May 9, 2025
f717543
update cm kernel
riverlijunjie May 10, 2025
b50d233
update for CM_MASK
riverlijunjie May 11, 2025
a72b880
remove perf tool
riverlijunjie May 11, 2025
851babf
minor update
riverlijunjie May 11, 2025
210f1d4
add cache for moe_expert(temporarily disable cm due to additional scal…
luo-cheng2021 May 12, 2025
b86c8f3
minor cleanup
luo-cheng2021 May 12, 2025
b9e91da
revert optimization for shared_ops_optimization
riverlijunjie May 12, 2025
a011264
fix ci error
luo-cheng2021 May 12, 2025
0d9354f
fix CI failure
luo-cheng2021 May 12, 2025
8d1f344
Move cm kernel to cm directory
riverlijunjie May 19, 2025
6df53a1
Merge branch 'master' into gpu/qwen3_moe_cm
riverlijunjie May 20, 2025
ff0c74f
apply review comments
luo-cheng2021 May 20, 2025
18312f2
Some reviewer comments
riverlijunjie May 20, 2025
e06f8de
Remove CM kernel and move to the following PR
riverlijunjie May 21, 2025
f43d457
apply review comments
luo-cheng2021 May 21, 2025
0c90c5a
Merge branch 'master' into gpu/qwen3_moe_cm
peterchen-intel May 25, 2025
255992b
Merge remote-tracking branch 'upstream/master' into gpu/qwen2_moe_cm
luo-cheng2021 Jun 5, 2025
4662b8f
apply review comments
luo-cheng2021 Jun 5, 2025
0d0ecca
Fix CI issues
riverlijunjie Jun 6, 2025
41fa911
transformation support more weight datatype
luo-cheng2021 Jun 17, 2025
e7e909a
Merge remote-tracking branch 'upstream/master' into gpu/qwen3_moe_cm
luo-cheng2021 Jun 17, 2025
73071de
apply review comments
luo-cheng2021 Jun 23, 2025
2181617
Merge remote-tracking branch 'upstream/master' into gpu/qwen3_moe_cm
luo-cheng2021 Jun 23, 2025
ed49b4e
fuse router+expert0 first to avoid fusing expert success but router f…
luo-cheng2021 Jun 25, 2025
ad36133
Merge branch 'master' into gpu/qwen3_moe_cm
peterchen-intel Jun 30, 2025
37ca973
solve merge conflicts
luo-cheng2021 Jul 1, 2025
cba1763
Merge branch 'main' into gpu/qwen3_moe_cm
riverlijunjie Jul 11, 2025
1ed2a11
Handle reviewer's comments
riverlijunjie Jul 11, 2025
4807bfd
Merge branch 'releases/2025/3' into gpu/qwen3_moe_cm
WeldonWangwang Sep 11, 2025
9b982af
Fix build error
WeldonWangwang Sep 11, 2025
@@ -0,0 +1,160 @@
.. {#openvino_docs_ops_internal_MOE}

MOE
===


.. meta::
   :description: Learn about MOE - a basic block for the mixture of experts.

**Versioned name**: *MOE*

**Category**: *Sequence processing*

**Short description**: *MOE* partially implements
`Qwen3MoeSparseMoeBlock.forward <https://github.com/huggingface/transformers/blob/1fed6166c00b800330fcda8494f78cbcad8e4e3b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py#L235-L263>`__,
omitting the ``gate`` (router) linear projection, whose output is provided through the ``router_logits`` input.

**Detailed description**:

*MOE* provides functionality equivalent to the following PyTorch pseudo-code:

.. code-block:: py
   :force:

   def MOE(hidden_states, router_logits, attrs):
       batch_size, sequence_length, hidden_dim = hidden_states.shape
       hidden_states = hidden_states.view(-1, hidden_dim)
       # router_logits: (batch * sequence_length, n_experts)

       routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
       routing_weights, selected_experts = torch.topk(routing_weights, attrs.topk, dim=-1)
       routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
       # we cast back to the input dtype
       routing_weights = routing_weights.to(hidden_states.dtype)

       final_hidden_states = torch.zeros(
           (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
       )

       # One-hot encode the selected experts to create an expert mask.
       # This will be used to easily index which expert is going to be solicited.
       expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=attrs.expert_num).permute(2, 1, 0)

       # Loop over all available experts in the model and perform the computation on each expert
       for expert_idx in range(attrs.expert_num):
           expert_layer = attrs.experts[expert_idx]
           idx, top_x = torch.where(expert_mask[expert_idx])

           # Index the correct hidden states and compute the expert hidden state for
           # the current expert. We need to make sure to multiply the output hidden
           # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
           current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
           current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

           # However `index_add_` only supports torch tensors for indexing, so we use
           # the `top_x` tensor here.
           final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
       return final_hidden_states
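
The routing step above keeps only the ``topk`` largest softmax weights per token and renormalizes them so that the selected weights sum to 1. Below is a minimal, self-contained sketch of just that step; the input values are illustrative only and are not part of the operation definition.

.. code-block:: py
   :force:

   import torch
   import torch.nn.functional as F

   # Toy router output: 1 token, 4 experts (expert_num=4), topk=2.
   router_logits = torch.tensor([[0.1, 2.0, 0.3, 1.5]])

   routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
   # Keep only the two strongest experts (indices 1 and 3 for these logits).
   routing_weights, selected_experts = torch.topk(routing_weights, 2, dim=-1)
   # Renormalize so that the two selected weights sum to 1.
   routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

   print(selected_experts)  # tensor([[1, 3]])
   print(routing_weights)   # approximately tensor([[0.62, 0.38]])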


**Attributes**

* *topk*

  * **Description**: The number of selected (activated) experts per token. Must be less than or equal to ``expert_num``.
  * **Range of values**: a positive integer number
  * **Type**: ``size_t``
  * **Required**: *yes*

* *expert_num*

  * **Description**: The total number of experts.
  * **Range of values**: a positive integer number
  * **Type**: ``size_t``
  * **Required**: *yes*

* *hidden_size*

  * **Description**: Hidden feature size, extracted from the ``hidden_states`` input.
  * **Range of values**: a positive integer number
  * **Type**: ``size_t``
  * **Required**: *yes*

* *intermediate_size*

  * **Description**: Intermediate (expert MLP) size, extracted from the ``expert_layer`` mentioned in the pseudo-code.
  * **Range of values**: a positive integer number
  * **Type**: ``size_t``
  * **Required**: *yes*

* *group_size*

  * **Description**: Weight-compression group size, extracted from the ``expert_layer`` mentioned in the pseudo-code. ``0`` means no grouping.
  * **Range of values**: a non-negative integer number
  * **Type**: ``size_t``
  * **Required**: *no*

* *weight_type*

  * **Description**: Weight data type, extracted from the ``expert_layer`` mentioned in the pseudo-code.
  * **Range of values**: "f16", "f32", "u8", "u4"
  * **Required**: *yes*

* *scale_type*

  * **Description**: Scale data type, extracted from the ``expert_layer`` mentioned in the pseudo-code.
  * **Range of values**: "f16", "dynamic"
  * **Required**: *no*

* *zp_type*

  * **Description**: Zero-point data type, extracted from the ``expert_layer`` mentioned in the pseudo-code.
  * **Range of values**: "u8", "u4", "dynamic"
  * **Required**: *no*

* *gates/ups/downs*

  * **Description**: Per-expert constants (weight, scale, and zero point for the gate, up, and down projections), extracted from the ``expert_layer`` mentioned in the pseudo-code; see the shape sketch after this list.
  * **Type**: ``v0::Constant``
  * **Required**: *yes*
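
For grouped weight compression, each serialized per-expert constant in the example at the end of this page is a 3D tensor: the weight has shape ``[output_size, input_size / group_size, group_size]`` and the per-group scale and zero point have shape ``[output_size, input_size / group_size, 1]``. The sketch below only illustrates these shapes; the helper function is not part of the operation, and the ``down``-projection layout is an assumption following the same pattern.

.. code-block:: py
   :force:

   # Illustrative only: expected constant shapes for one expert, assuming the
   # grouped-quantization layout used in the example at the end of this page.
   def expert_const_shapes(hidden_size, intermediate_size, group_size):
       def shapes(output_size, input_size):
           weight = (output_size, input_size // group_size, group_size)
           scale = (output_size, input_size // group_size, 1)
           zp = (output_size, input_size // group_size, 1)
           return weight, scale, zp

       return {
           "gate": shapes(intermediate_size, hidden_size),
           "up": shapes(intermediate_size, hidden_size),
           # Assumption: the down projection follows the same pattern with
           # swapped input/output sizes.
           "down": shapes(hidden_size, intermediate_size),
       }

   # For the example below (hidden_size=2048, intermediate_size=768, group_size=128):
   #   gate/up weight -> (768, 16, 128), scale/zp -> (768, 16, 1)
   #   down weight    -> (2048, 6, 128), scale/zp -> (2048, 6, 1)
   print(expert_const_shapes(2048, 768, 128))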

**Inputs**

* **1**: ``hidden_states`` - a 2D tensor of type *T* with the shape ``[batch, hidden_size]``. **Required.**

* **2**: ``router_logits`` - a 2D tensor of type *T* with the shape ``[batch, expert_num]``. **Required.**


**Outputs**

* **1**: Output tensor of the same shape and type as the ``hidden_states`` input tensor.

**Types**

* *T*: any floating point type.

**Example**

.. code-block:: xml
   :force:

   <layer id="5" name="moe_router" type="MOE" version="ie_internal_opset">
       <data config.topk="2" config.expert_num="4" config.hidden_size="2048" config.intermediate_size="768" config.group_size="128" config.fused_router_logic="1" config.weight_type="u4" config.scale_type="f16" config.zp_type="u4" expert0_mlp0.element_type="u4" expert0_mlp0.shape="768, 16, 128" expert0_mlp1.element_type="f16" expert0_mlp1.shape="768, 16, 1" expert0_mlp2.element_type="u4" expert0_mlp2.shape="768, 16, 1" expert1_mlp0.element_type="u4" expert1_mlp0.shape="768, 16, 128" expert1_mlp1.element_type="f16" expert1_mlp1.shape="768, 16, 1" expert1_mlp2.element_type="u4" expert1_mlp2.shape="768, 16, 1" expert2_mlp0.element_type="u4" expert2_mlp0.shape="768, 16, 128" expert2_mlp1.element_type="f16" expert2_mlp1.shape="768, 16, 1" expert2_mlp2.element_type="u4" expert2_mlp2.shape="768, 16, 1" expert3_mlp0.element_type="u4" expert3_mlp0.shape="768, 16, 128" expert3_mlp1.element_type="f16" expert3_mlp1.shape="768, 16, 1" expert3_mlp2.element_type="u4" expert3_mlp2.shape="768, 16, 1" />
       <input>
           <port id="0" precision="FP32">
               <dim>-1</dim>
               <dim>2048</dim>
           </port>
           <port id="1" precision="FP32">
               <dim>-1</dim>
               <dim>4</dim>
           </port>
       </input>
       <output>
           <port id="2" precision="FP32">
               <dim>-1</dim>
               <dim>2048</dim>
           </port>
       </output>
   </layer>
91 changes: 91 additions & 0 deletions src/common/transformations/include/ov_ops/moe.hpp
@@ -0,0 +1,91 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <array>
#include <memory>
#include <tuple>
#include <vector>

#include "openvino/core/node.hpp"
#include "openvino/core/type/element_type.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/op.hpp"
#include "transformations_visibility.hpp"

namespace ov::op::internal {
///
/// \brief MOE experts
class TRANSFORMATIONS_API MOE : public ov::op::Op {
public:
    OPENVINO_OP("MOE", "ie_internal_opset");

    MOE() = default;

    struct Config {
        size_t topk{};
        size_t expert_num{};
        size_t hidden_size{};
        size_t intermediate_size{};
        size_t group_size{};             // quantized group size, 0 for no group size. same for gate/up/down
        ov::element::Type weight_type{}; // same for gate/up/down
        ov::element::Type scale_type{};  // same for gate/up/down
        ov::element::Type zp_type{};     // same for gate/up/down
        bool operator==(const Config& rhs) const {
            return std::tie(topk,
                            expert_num,
                            hidden_size,
                            intermediate_size,
                            group_size,
                            weight_type,
                            scale_type,
                            zp_type) == std::tie(rhs.topk,
                                                 rhs.expert_num,
                                                 rhs.hidden_size,
                                                 rhs.intermediate_size,
                                                 rhs.group_size,
                                                 rhs.weight_type,
                                                 rhs.scale_type,
                                                 rhs.zp_type);
        }
    };

    // 0: weight, 1: scale, 2: zp
    struct ConstsPerExpert {
        std::array<std::shared_ptr<ov::op::v0::Constant>, 3> gates;
        std::array<std::shared_ptr<ov::op::v0::Constant>, 3> ups;
        std::array<std::shared_ptr<ov::op::v0::Constant>, 3> downs;
    };
    struct Attributes {
        // expert config
        Config config;
        // expert weight/scale/zp
        std::vector<ConstsPerExpert> consts;
    };

    MOE(const OutputVector& args, const Attributes& attrs);

    const Config& get_config() const;
    void set_config(const Config& config);
    const std::vector<ConstsPerExpert>& get_consts() const {
        return m_attrs.consts;
    }

    void add_consts(size_t expert_no, const ConstsPerExpert& consts) {
        OPENVINO_ASSERT(expert_no == m_attrs.consts.size(),
                        "MOE add_consts failed. Expected expert number: ",
                        m_attrs.consts.size(),
                        ", current: ",
                        expert_no);
        m_attrs.consts.push_back(consts);
    }

    bool visit_attributes(AttributeVisitor& visitor) override;
    void validate_and_infer_types() override;
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

private:
    Attributes m_attrs;
};

} // namespace ov::op::internal
@@ -0,0 +1,36 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/matcher_pass.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace pass {

class TRANSFORMATIONS_API FuseMOEExpert;
class TRANSFORMATIONS_API FuseMOERouter;
class TRANSFORMATIONS_API FuseMOE;

} // namespace pass
} // namespace ov

class ov::pass::FuseMOEExpert : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("FuseMOEExpert");
    FuseMOEExpert();
};

class ov::pass::FuseMOERouter : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("FuseMOERouter");
    FuseMOERouter();
};

class ov::pass::FuseMOE : public ov::pass::ModelPass {
public:
    OPENVINO_MODEL_PASS_RTTI("FuseMOE");
    bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
};
67 changes: 67 additions & 0 deletions src/common/transformations/src/ov_ops/moe.cpp
@@ -0,0 +1,67 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ov_ops/moe.hpp"

#include "itt.hpp"

namespace ov {
namespace op {
namespace internal {

MOE::MOE(const OutputVector& args, const Attributes& attrs) : Op(args), m_attrs(attrs) {
    constructor_validate_and_infer_types();
}

const MOE::Config& MOE::get_config() const {
    return m_attrs.config;
}

void MOE::set_config(const Config& config) {
    m_attrs.config = config;
}

std::shared_ptr<ov::Node> MOE::clone_with_new_inputs(const ov::OutputVector& new_args) const {
    INTERNAL_OP_SCOPE(internal_MOE_clone_with_new_inputs);
    check_new_args_count(this, new_args);

    return std::make_shared<MOE>(new_args, m_attrs);
}

void MOE::validate_and_infer_types() {
    INTERNAL_OP_SCOPE(internal_MOE_validate_and_infer_types);
    OPENVINO_ASSERT(get_input_size() == 2, "MOE must have 2 inputs whereas it has ", get_input_size());

    set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
}

bool MOE::visit_attributes(ov::AttributeVisitor& visitor) {
    INTERNAL_OP_SCOPE(internal_MOE_visit_attributes);
    visitor.start_structure("config");

    visitor.on_attribute("topk", m_attrs.config.topk);
    visitor.on_attribute("expert_num", m_attrs.config.expert_num);
    visitor.on_attribute("hidden_size", m_attrs.config.hidden_size);
    visitor.on_attribute("intermediate_size", m_attrs.config.intermediate_size);
    visitor.on_attribute("group_size", m_attrs.config.group_size);
    visitor.on_attribute("weight_type", m_attrs.config.weight_type);
    visitor.on_attribute("scale_type", m_attrs.config.scale_type);
    visitor.on_attribute("zp_type", m_attrs.config.zp_type);
    visitor.finish_structure();
    m_attrs.consts.resize(m_attrs.config.expert_num);
    for (size_t i = 0; i < m_attrs.config.expert_num; i++) {
        for (size_t j = 0; j < 3; j++) {
            if (m_attrs.consts[i].gates[j]) {
                visitor.start_structure("expert" + std::to_string(i) + "_mlp" + std::to_string(j));
                m_attrs.consts[i].gates[j]->visit_attributes(visitor);
                visitor.finish_structure();
            }
        }
    }
    return true;
}

} // namespace internal
} // namespace op
} // namespace ov