From adeb32b0261aa608bc6d69820b1110636a6c4fa1 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 5 Jun 2025 15:10:10 +0800 Subject: [PATCH 01/19] draft: enable gpu to support dynamic customer op. Signed-off-by: xiping.yan --- .../intel_gpu/graph/kernel_impl_params.hpp | 5 +++ .../primitives/custom_gpu_primitive.hpp | 16 ++++++--- .../src/graph/custom_gpu_primitive.cpp | 22 ++++++++++++ .../src/graph/impls/ocl/custom_primitive.cpp | 15 ++++++-- .../graph/include/custom_gpu_primitive_inst.h | 30 +++++++++++++++- .../intel_gpu/src/plugin/ops/custom.cpp | 34 ++++++++++++------- 6 files changed, 102 insertions(+), 20 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp index bb4bc6bed2ecea..e887b8ea09fcd0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp @@ -47,6 +47,8 @@ struct kernel_impl_params final { size_t unique_id; bool _can_be_optimized = false; bool _runtime_skippable = false; + std::vector custom_op_dynamic_gws; + std::vector custom_op_dynamic_lws; std::vector input_layouts; std::vector output_layouts; std::vector input_offsets; @@ -136,6 +138,9 @@ struct kernel_impl_params final { return output_layouts[idx]; } + size_t get_input_layout_size() const { + return input_layouts.size(); + } bool has_fused_primitives() const { return !fused_desc.empty(); } diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp index e086cfb13dadbe..959e3a1e088b3f 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp @@ -47,6 +47,10 @@ struct custom_gpu_primitive : public primitive_base { } }; + static void update_work_group_size(const 
std::shared_ptr& op) { + std::cout << "== update_work_group_size...." << std::endl; + } + /// @brief Constructs custom_gpu_primitive primitive /// @param id This primitive id. /// @param input Input primitive ids. @@ -65,7 +69,8 @@ struct custom_gpu_primitive : public primitive_base { const std::string& build_options, const layout& output_layout, const std::vector& gws = {}, - const std::vector& lws = {}) + const std::vector& lws = {}, + const std::shared_ptr& op = nullptr) : primitive_base(id, inputs, 1, {optional_data_type()}, {output_layout.data_padding}), kernel_entry_point(kernel_entry_point), kernel_arguments(kernel_arguments), @@ -73,7 +78,8 @@ struct custom_gpu_primitive : public primitive_base { output_layout(output_layout), gws(gws.size() ? gws : std::vector{output_layout.count()}), lws(lws), - kernels_code(kernels_code) {} + kernels_code(kernels_code), + op(op) {} /// @brief The name of the entry point function in the kernel const std::string kernel_entry_point; @@ -84,11 +90,13 @@ struct custom_gpu_primitive : public primitive_base { /// @brief The output layout declared by the primitive const layout output_layout; /// @brief The global working sizes - const std::vector gws; + std::vector gws; /// @brief The local working sizes - const std::vector lws; + std::vector lws; /// @brief Source code for the kernel const primitive_id_arr kernels_code; + /// @brief Original IR op + const std::shared_ptr op; size_t hash() const override { size_t seed = primitive::hash(); diff --git a/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp b/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp index 4873c272f800fa..0c5fdae43ae850 100644 --- a/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp @@ -29,6 +29,28 @@ std::string custom_gpu_primitive_inst::to_string(custom_gpu_primitive_node const return primitive_description.str(); } +kernel_impl_params 
custom_gpu_primitive_inst::get_fake_aligned_params(kernel_impl_params const& orig_impl_param) { + auto updated_param = std::move(orig_impl_param); + const auto& orig_input_layout = orig_impl_param.get_input_layout(); + const auto& orig_output_layout = orig_impl_param.get_output_layout(); + OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(), + "in/out layouts should be static for fake alignment!"); + + auto output_shape = orig_output_layout.get_partial_shape().to_shape(); + + // auto op = std::static_pointer_cast(updated_param.desc); + updated_param.custom_op_dynamic_gws = output_shape; + + custom_gpu_primitive::update_work_group_size(std::shared_ptr()); + + // updated_param.output_layouts[0] = orig_output_layout.clone_with_other_shape(output_shape); + // std::cout << "Apply fake alignment: input(" << orig_input_layout.to_short_string() << " -> " + // << updated_param.input_layouts[0].to_short_string() << "), output(" << orig_output_layout.to_short_string() << " -> " + // << updated_param.output_layouts[0].to_short_string() << ")\n"; + + return updated_param; +} + custom_gpu_primitive_inst::typed_primitive_inst(network& network, custom_gpu_primitive_node const& node) : parent(network, node) {} } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index 4aeadc7a297da3..41522367c1cc95 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -71,6 +71,17 @@ struct custom_gpu_primitive_impl : typed_primitive_impl { return {kernels_cache.get_cached_kernel_id(_kernels[0])}; } + void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { + // OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); + auto& kernel_vec = kernels.begin()->second; + _kernels.clear(); + 
_kernels.resize(kernel_vec.size()); + for (auto& k : kernel_vec) { + auto sub_kernel_idx = k.second; + _kernels[sub_kernel_idx] = k.first; + } + } + void set_arguments_impl(custom_gpu_primitive_inst& instance) override { auto& stream = instance.get_network().get_stream(); kernel_arguments_data args; @@ -217,7 +228,7 @@ static std::string get_jit_constant(const custom_gpu_primitive_node& outer, cons const auto primitive = outer.get_primitive().get(); mem_consts.AddConstants({ - kernel_selector::MakeJitConstant("GLOBAL_WORKSIZE", primitive->gws), + kernel_selector::MakeJitConstant("GLOBAL_WORKSIZE", impl_param.custom_op_dynamic_gws.size() > 0 ? impl_param.custom_op_dynamic_gws : primitive->gws), kernel_selector::MakeJitConstant("LOCAL_WORKSIZE", primitive->lws), }); @@ -248,7 +259,7 @@ static std::unique_ptr create(const custom_gpu_primitive_node& a cl_kernel->code.kernelString->str += s + "\n"; } - cl_kernel->params.workGroups.global = primitive->gws; + cl_kernel->params.workGroups.global = impl_param.custom_op_dynamic_gws.size() > 0 ? 
impl_param.custom_op_dynamic_gws : primitive->gws; cl_kernel->params.workGroups.local = primitive->lws; for (const auto& p : primitive->kernel_arguments) { diff --git a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h index 365fdc774b8f54..ecc630e86c402c 100644 --- a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h @@ -5,6 +5,7 @@ #pragma once #include "intel_gpu/primitives/custom_gpu_primitive.hpp" #include "primitive_inst.h" +#include "openvino/op/parameter.hpp" #include @@ -19,12 +20,37 @@ class typed_primitive_inst : public typed_primitive_inst_b public: template - static std::vector calc_output_layouts(custom_gpu_primitive_node const& /*node*/, const kernel_impl_params& impl_param) { + static std::vector calc_output_layouts(custom_gpu_primitive_node const& node, const kernel_impl_params& impl_param) { assert(static_cast(impl_param.desc->output_data_types[0]) == false && "Output data type forcing is not supported for " "custom_gpu_primitive_node!"); layout output_layout = impl_param.typed_desc()->output_layout; + bool is_dynamic = false; + const auto inp_sz = impl_param.get_input_layout_size(); + for (size_t i = 0; i < inp_sz; i++) { + if (impl_param.get_input_layout(i).is_dynamic()) { + is_dynamic = true; + break; + } + } + + if (!is_dynamic && output_layout.is_dynamic()) { + ov::OutputVector new_inputs; + for (size_t i = 0; i < inp_sz; i++) { + auto dt = impl_param.get_input_layout(i).data_type; + // std::make_shared(inType, inputDynamicShapes[0]) + auto input = std::make_shared(dt, impl_param.get_input_layout(i).get_shape()); + new_inputs.emplace_back(input); + } + + auto op = impl_param.typed_desc()->op; + auto new_op = op->clone_with_new_inputs(new_inputs); + new_op->validate_and_infer_types(); + auto new_outp_shape = new_op->get_output_shape(0); + 
output_layout.set_partial_shape(new_outp_shape); + } + // if the output layout format was set to any, it means the layer output format will be the same as the first input if (output_layout.format == format::any) { output_layout.format = impl_param.get_input_layout().format; @@ -48,6 +74,8 @@ class typed_primitive_inst : public typed_primitive_inst_b static std::string to_string(custom_gpu_primitive_node const& node); + static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param); + public: typed_primitive_inst(network& network, custom_gpu_primitive_node const& node); }; diff --git a/src/plugins/intel_gpu/src/plugin/ops/custom.cpp b/src/plugins/intel_gpu/src/plugin/ops/custom.cpp index 84c54da01786c6..e11a848e3f4e60 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/custom.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/custom.cpp @@ -169,23 +169,30 @@ void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& op, Cust const std::string layerTitle("\n// Layer " + op->get_friendly_name() + " using Custom Layer " + customLayer->Name() + "\n"); const std::string defineTitle("// Custom Layer User Defines\n"); - auto dims = op->get_output_shape(0); - size_t N = (dims.size() > 0) ? dims[0] : 1; - size_t C = (dims.size() > 1) ? dims[1] : 1; - size_t H = (dims.size() > 2) ? dims[2] : 1; - size_t W = (dims.size() > 3) ? dims[3] : 1; - cldnn::tensor outputTensor = cldnn::tensor(cldnn::batch(N), cldnn::feature(C), cldnn::spatial(W, H)); + auto dims = op->get_output_partial_shape(0); + std::cout << "CustomOp output dims=" << dims << ", dims.size()=" << dims.size() << std::endl; - cldnn::layout outputLayout = cldnn::layout(cldnn::element_type_to_data_type(op->get_output_element_type(0)), outputFormat, outputTensor); + size_t N = (dims.size() > 0) ? dims[0].is_dynamic() ? -1 : dims[0].get_length() : 1; + size_t C = (dims.size() > 1) ? dims[1].is_dynamic() ? -1 : dims[1].get_length() : 1; + size_t H = (dims.size() > 2) ? dims[2].is_dynamic() ? 
-1 : dims[2].get_length() : 1; + size_t W = (dims.size() > 3) ? dims[3].is_dynamic() ? -1 : dims[3].get_length() : 1; + + cldnn::layout outputLayout; + if (dims.is_dynamic()) { + outputLayout = cldnn::layout(dims, cldnn::element_type_to_data_type(op->get_output_element_type(0)), outputFormat); + } else { + cldnn::tensor outputTensor = cldnn::tensor(cldnn::batch(N), cldnn::feature(C), cldnn::spatial(W, H)); + outputLayout = cldnn::layout(cldnn::element_type_to_data_type(op->get_output_element_type(0)), outputFormat, outputTensor); + } // evaluate work sizes rules std::vector gws, lws; // assume output tensor is dimension source by default - int batchDim = outputTensor.batch[0]; - int featureDim = outputTensor.feature[0]; - int yDim = outputTensor.spatial[1]; - int xDim = outputTensor.spatial[0]; + int batchDim = N; + int featureDim = C; + int yDim = H; + int xDim = W; int iidx = customLayer->InputDimSourceIndex(); std::string genericLayerName = layer_type_name_ID(op); @@ -227,7 +234,8 @@ void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& op, Cust customLayer->CompilerOptions(), outputLayout, gws, - lws); + lws, + op); p.add_primitive(*op, customPrim); auto prevLayerName = genericLayerName; @@ -236,7 +244,7 @@ void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& op, Cust auto reorderPrimName = genericLayerName + ProgramBuilder::m_postCustomLayerTag; p.add_primitive(*op, cldnn::reorder(reorderPrimName, cldnn::input_info(genericLayerName), - cldnn::format::get_default_format(op->get_output_shape(0).size()), + cldnn::format::get_default_format(op->get_output_partial_shape(0).size()), customPrim.output_layout.data_type)); prevLayerName = reorderPrimName; } From bf563f3fb4fa8b49d88c16da25d447f2ba982586 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 5 Jun 2025 20:20:51 +0800 Subject: [PATCH 02/19] wrapper calc work size for dynamic shape to update gws. 
Signed-off-by: xiping.yan --- .../primitives/custom_gpu_primitive.hpp | 70 +++++++++++++++++-- .../src/graph/custom_gpu_primitive.cpp | 28 +++++--- .../src/graph/impls/ocl/custom_primitive.cpp | 2 +- .../intel_gpu/src/plugin/ops/custom.cpp | 52 +++++--------- 4 files changed, 101 insertions(+), 51 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp index 959e3a1e088b3f..1a26039c62c962 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp @@ -5,6 +5,7 @@ #pragma once #include "primitive.hpp" #include "intel_gpu/runtime/memory.hpp" +#include "intel_gpu/plugin/simple_math.hpp" #include #include @@ -47,8 +48,58 @@ struct custom_gpu_primitive : public primitive_base { } }; - static void update_work_group_size(const std::shared_ptr& op) { - std::cout << "== update_work_group_size...." << std::endl; + static void update_work_group_size(const ov::PartialShape& dims, + int iidx, + const ov::PartialShape& inputDims, + const std::vector& globalSizeRules, + const std::vector& localSizeRules, + std::vector& gws, + std::vector& lws) { +#define GetDim(DIM) DIM.is_dynamic() ? -1 : DIM.get_length() + + gws.clear(); + lws.clear(); + + int batchDim = 0, featureDim = 0, yDim = 0, xDim = 0; + // if input index is greater than -1, take dimension from input + if (iidx >= 0) { + xDim = static_cast(GetDim(inputDims[inputDims.size() - 1])); + yDim = dims.size() > 1 ? static_cast(GetDim(inputDims[inputDims.size() - 2])) : 0; + featureDim = dims.size() > 2 ? static_cast(GetDim(inputDims[inputDims.size() - 3])) : 0; + batchDim = dims.size() > 3 ? static_cast(GetDim(inputDims[inputDims.size() - 4])) : 0; + } else { + batchDim = (dims.size() > 0) ? GetDim(dims[0]) : 1; + featureDim = (dims.size() > 1) ? GetDim(dims[1]) : 1; + yDim = (dims.size() > 2) ? 
GetDim(dims[2]) : 1; + xDim = (dims.size() > 3) ? GetDim(dims[3]) : 1; + } + const std::map vars = { + {'b', batchDim}, {'B', batchDim}, + {'f', featureDim}, {'F', featureDim}, + {'y', yDim}, {'Y', yDim}, + {'x', xDim}, {'X', xDim}, + }; + for (const auto& rule : globalSizeRules) { + SimpleMathExpression expr; + expr.SetVariables(vars); + expr.SetExpression(rule); + gws.push_back(expr.Evaluate()); + } + for (const auto& rule : localSizeRules) { + SimpleMathExpression expr; + expr.SetVariables(vars); + expr.SetExpression(rule); + lws.push_back(expr.Evaluate()); + } + // std::cout << "======= New gws: "; + // for (auto tmp : gws) { + // std::cout << tmp << ", "; + // } + // std::cout << std::endl << " lws: "; + // for (auto tmp : gws) { + // std::cout << tmp << ", "; + // } + // std::cout << std::endl; } /// @brief Constructs custom_gpu_primitive primitive @@ -70,7 +121,10 @@ struct custom_gpu_primitive : public primitive_base { const layout& output_layout, const std::vector& gws = {}, const std::vector& lws = {}, - const std::shared_ptr& op = nullptr) + const std::shared_ptr& op = nullptr, + const int iidx = -1, + const std::vector globalSizeRules = {}, + const std::vector localSizeRules = {}) : primitive_base(id, inputs, 1, {optional_data_type()}, {output_layout.data_padding}), kernel_entry_point(kernel_entry_point), kernel_arguments(kernel_arguments), @@ -79,7 +133,10 @@ struct custom_gpu_primitive : public primitive_base { gws(gws.size() ? 
gws : std::vector{output_layout.count()}), lws(lws), kernels_code(kernels_code), - op(op) {} + op(op), + iidx(iidx), + globalSizeRules(globalSizeRules), + localSizeRules(localSizeRules) {} /// @brief The name of the entry point function in the kernel const std::string kernel_entry_point; @@ -97,6 +154,11 @@ struct custom_gpu_primitive : public primitive_base { const primitive_id_arr kernels_code; /// @brief Original IR op const std::shared_ptr op; + /// @brief -1: mean calc gws via output, else calc gws via inputs(iidx) + const int iidx = -1; + /// @brief Custom provided rules for calc work sizes. + const std::vector globalSizeRules; + const std::vector localSizeRules; size_t hash() const override { size_t seed = primitive::hash(); diff --git a/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp b/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp index 0c5fdae43ae850..31f67f8066730f 100644 --- a/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp @@ -36,17 +36,23 @@ kernel_impl_params custom_gpu_primitive_inst::get_fake_aligned_params(kernel_imp OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(), "in/out layouts should be static for fake alignment!"); - auto output_shape = orig_output_layout.get_partial_shape().to_shape(); - - // auto op = std::static_pointer_cast(updated_param.desc); - updated_param.custom_op_dynamic_gws = output_shape; - - custom_gpu_primitive::update_work_group_size(std::shared_ptr()); - - // updated_param.output_layouts[0] = orig_output_layout.clone_with_other_shape(output_shape); - // std::cout << "Apply fake alignment: input(" << orig_input_layout.to_short_string() << " -> " - // << updated_param.input_layouts[0].to_short_string() << "), output(" << orig_output_layout.to_short_string() << " -> " - // << updated_param.output_layouts[0].to_short_string() << ")\n"; + auto op = std::static_pointer_cast(updated_param.desc); + + std::vector 
gws, lws; + custom_gpu_primitive::update_work_group_size(orig_output_layout.get_partial_shape(), + op->iidx, + orig_output_layout.get_partial_shape(), + op->globalSizeRules, + op->localSizeRules, + gws, + lws); + + // GPU_DEBUG_TRACE_DETAIL + std::cout << "Apply fake alignment: gws(" << ov::Shape(updated_param.custom_op_dynamic_gws).to_string() << " -> " << ov::Shape(gws).to_string() << "), lws(" + << ov::Shape(updated_param.custom_op_dynamic_lws).to_string() << " -> " << ov::Shape(lws).to_string() << ")\n"; + + updated_param.custom_op_dynamic_gws = gws; + updated_param.custom_op_dynamic_lws = lws; return updated_param; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index 41522367c1cc95..ec07a206a9f879 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -229,7 +229,7 @@ static std::string get_jit_constant(const custom_gpu_primitive_node& outer, cons mem_consts.AddConstants({ kernel_selector::MakeJitConstant("GLOBAL_WORKSIZE", impl_param.custom_op_dynamic_gws.size() > 0 ? impl_param.custom_op_dynamic_gws : primitive->gws), - kernel_selector::MakeJitConstant("LOCAL_WORKSIZE", primitive->lws), + kernel_selector::MakeJitConstant("LOCAL_WORKSIZE", impl_param.custom_op_dynamic_lws.size() > 0 ? 
impl_param.custom_op_dynamic_lws : primitive->lws), }); for (size_t i = 0; i < impl_param.input_layouts.size(); i++) { diff --git a/src/plugins/intel_gpu/src/plugin/ops/custom.cpp b/src/plugins/intel_gpu/src/plugin/ops/custom.cpp index e11a848e3f4e60..7da8211d321ac0 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/custom.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/custom.cpp @@ -170,7 +170,7 @@ void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& op, Cust const std::string defineTitle("// Custom Layer User Defines\n"); auto dims = op->get_output_partial_shape(0); - std::cout << "CustomOp output dims=" << dims << ", dims.size()=" << dims.size() << std::endl; + int iidx = customLayer->InputDimSourceIndex(); size_t N = (dims.size() > 0) ? dims[0].is_dynamic() ? -1 : dims[0].get_length() : 1; size_t C = (dims.size() > 1) ? dims[1].is_dynamic() ? -1 : dims[1].get_length() : 1; @@ -185,57 +185,39 @@ void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& op, Cust outputLayout = cldnn::layout(cldnn::element_type_to_data_type(op->get_output_element_type(0)), outputFormat, outputTensor); } - // evaluate work sizes rules std::vector gws, lws; - // assume output tensor is dimension source by default - int batchDim = N; - int featureDim = C; - int yDim = H; - int xDim = W; - int iidx = customLayer->InputDimSourceIndex(); - - std::string genericLayerName = layer_type_name_ID(op); // if input index is greater than -1, take dimension from input if (iidx >= 0) { if (static_cast(iidx) >= op->get_input_size()) OPENVINO_THROW("Invalid input tensor for index: ", iidx); auto inputDims = op->get_input_shape(iidx); - - xDim = static_cast(inputDims[inputDims.size() - 1]); - yDim = dims.size() > 1 ? static_cast(inputDims[inputDims.size() - 2]) : 0; - featureDim = dims.size() > 2 ? static_cast(inputDims[inputDims.size() - 3]) : 0; - batchDim = dims.size() > 3 ? 
static_cast(inputDims[inputDims.size() - 4]) : 0; - } - const std::map vars = { - { 'b', batchDim } , { 'B', batchDim }, - { 'f', featureDim }, { 'F', featureDim }, - { 'y', yDim }, { 'Y', yDim }, - { 'x', xDim }, { 'X', xDim }, - }; - for (const auto& rule : customLayer->GlobalSizeRules()) { - SimpleMathExpression expr; - expr.SetVariables(vars); - expr.SetExpression(rule); - gws.push_back(expr.Evaluate()); - } - for (const auto& rule : customLayer->LocalSizeRules()) { - SimpleMathExpression expr; - expr.SetVariables(vars); - expr.SetExpression(rule); - lws.push_back(expr.Evaluate()); + cldnn::custom_gpu_primitive::update_work_group_size(dims, iidx, inputDims, customLayer->GlobalSizeRules(), customLayer->LocalSizeRules(), gws, lws); + } else { + cldnn::custom_gpu_primitive::update_work_group_size(dims, + iidx, + ov::PartialShape(), + customLayer->GlobalSizeRules(), + customLayer->LocalSizeRules(), + gws, + lws); } + std::string genericLayerName = layer_type_name_ID(op); + auto customPrim = cldnn::custom_gpu_primitive(genericLayerName, reordered_inputs, - { layerTitle, defineTitle, layerDefines, customLayer->KernelSource() }, + {layerTitle, defineTitle, layerDefines, customLayer->KernelSource()}, customLayer->KernelEntry(), kernelParameters, customLayer->CompilerOptions(), outputLayout, gws, lws, - op); + op, + iidx, + customLayer->GlobalSizeRules(), + customLayer->LocalSizeRules()); p.add_primitive(*op, customPrim); auto prevLayerName = genericLayerName; From 133c3f8d8749991c0e717806fbf0323e5700f706 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Wed, 11 Jun 2025 17:28:14 +0800 Subject: [PATCH 03/19] Clone a new op to make sure original model can be released.Back Signed-off-by: xiping.yan --- src/plugins/intel_gpu/src/plugin/ops/custom.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/ops/custom.cpp b/src/plugins/intel_gpu/src/plugin/ops/custom.cpp index 7da8211d321ac0..ca0773a57d9f7c 100644 --- 
a/src/plugins/intel_gpu/src/plugin/ops/custom.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/custom.cpp @@ -205,6 +205,14 @@ void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& op, Cust std::string genericLayerName = layer_type_name_ID(op); + // Clone a new op to make sure original model can be released. + ov::OutputVector new_inputs; + for (size_t i = 0; i < op->get_input_size(); i++) { + auto input = std::make_shared(op->get_input_element_type(i), op->get_input_partial_shape(i)); + new_inputs.emplace_back(input); + } + std::shared_ptr op_bk = op->clone_with_new_inputs(new_inputs); + auto customPrim = cldnn::custom_gpu_primitive(genericLayerName, reordered_inputs, {layerTitle, defineTitle, layerDefines, customLayer->KernelSource()}, @@ -214,7 +222,7 @@ void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& op, Cust outputLayout, gws, lws, - op, + op_bk, iidx, customLayer->GlobalSizeRules(), customLayer->LocalSizeRules()); From 129baa5476482b465f0204e92fa4565c5ebb306b Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 12 Jun 2025 11:01:42 +0800 Subject: [PATCH 04/19] update debug log, and revert useless update. 
--- .../include/intel_gpu/primitives/custom_gpu_primitive.hpp | 4 ++-- src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp index 1a26039c62c962..cd1fd9a5173884 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp @@ -147,9 +147,9 @@ struct custom_gpu_primitive : public primitive_base { /// @brief The output layout declared by the primitive const layout output_layout; /// @brief The global working sizes - std::vector gws; + const std::vector gws; /// @brief The local working sizes - std::vector lws; + const std::vector lws; /// @brief Source code for the kernel const primitive_id_arr kernels_code; /// @brief Original IR op diff --git a/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp b/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp index 31f67f8066730f..591d37781aee5e 100644 --- a/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp @@ -47,9 +47,8 @@ kernel_impl_params custom_gpu_primitive_inst::get_fake_aligned_params(kernel_imp gws, lws); - // GPU_DEBUG_TRACE_DETAIL - std::cout << "Apply fake alignment: gws(" << ov::Shape(updated_param.custom_op_dynamic_gws).to_string() << " -> " << ov::Shape(gws).to_string() << "), lws(" - << ov::Shape(updated_param.custom_op_dynamic_lws).to_string() << " -> " << ov::Shape(lws).to_string() << ")\n"; + GPU_DEBUG_TRACE_DETAIL << "Apply fake alignment: gws(" << ov::Shape(op->gws).to_string() << " -> " << ov::Shape(gws).to_string() << "), lws(" + << ov::Shape(op->lws).to_string() << " -> " << ov::Shape(lws).to_string() << ")\n"; updated_param.custom_op_dynamic_gws = gws; updated_param.custom_op_dynamic_lws = 
lws; From 93339008978ddbdab2a603e318b81fce620ec92a Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 12 Jun 2025 11:08:52 +0800 Subject: [PATCH 05/19] is_dynamic->is_dynamic_input Signed-off-by: xiping.yan --- .../intel_gpu/primitives/custom_gpu_primitive.hpp | 9 --------- .../intel_gpu/src/graph/impls/ocl/custom_primitive.cpp | 2 +- .../src/graph/include/custom_gpu_primitive_inst.h | 10 ++++------ 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp index cd1fd9a5173884..515aa2b30c8c58 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp @@ -91,15 +91,6 @@ struct custom_gpu_primitive : public primitive_base { expr.SetExpression(rule); lws.push_back(expr.Evaluate()); } - // std::cout << "======= New gws: "; - // for (auto tmp : gws) { - // std::cout << tmp << ", "; - // } - // std::cout << std::endl << " lws: "; - // for (auto tmp : gws) { - // std::cout << tmp << ", "; - // } - // std::cout << std::endl; } /// @brief Constructs custom_gpu_primitive primitive diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index ec07a206a9f879..e0e2eb808a5fc9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -72,7 +72,7 @@ struct custom_gpu_primitive_impl : typed_primitive_impl { } void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { - // OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); + OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); auto& kernel_vec = kernels.begin()->second; 
_kernels.clear(); _kernels.resize(kernel_vec.size()); diff --git a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h index ecc630e86c402c..3ae0ec957fa4ac 100644 --- a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h @@ -26,21 +26,19 @@ class typed_primitive_inst : public typed_primitive_inst_b "custom_gpu_primitive_node!"); layout output_layout = impl_param.typed_desc()->output_layout; - bool is_dynamic = false; + bool is_dynamic_input = false; const auto inp_sz = impl_param.get_input_layout_size(); for (size_t i = 0; i < inp_sz; i++) { if (impl_param.get_input_layout(i).is_dynamic()) { - is_dynamic = true; + is_dynamic_input = true; break; } } - if (!is_dynamic && output_layout.is_dynamic()) { + if (!is_dynamic_input && output_layout.is_dynamic()) { ov::OutputVector new_inputs; for (size_t i = 0; i < inp_sz; i++) { - auto dt = impl_param.get_input_layout(i).data_type; - // std::make_shared(inType, inputDynamicShapes[0]) - auto input = std::make_shared(dt, impl_param.get_input_layout(i).get_shape()); + auto input = std::make_shared(impl_param.get_input_layout(i).data_type, impl_param.get_input_layout(i).get_shape()); new_inputs.emplace_back(input); } From d13c484ef44dcea12c70a9be953aa9ed4106a424 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Tue, 17 Jun 2025 09:51:43 +0800 Subject: [PATCH 06/19] wrapper get_output_shape --- .../graph/include/custom_gpu_primitive_inst.h | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h index 3ae0ec957fa4ac..c260fe0c22702c 100644 --- a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h @@ 
-26,28 +26,7 @@ class typed_primitive_inst : public typed_primitive_inst_b "custom_gpu_primitive_node!"); layout output_layout = impl_param.typed_desc()->output_layout; - bool is_dynamic_input = false; - const auto inp_sz = impl_param.get_input_layout_size(); - for (size_t i = 0; i < inp_sz; i++) { - if (impl_param.get_input_layout(i).is_dynamic()) { - is_dynamic_input = true; - break; - } - } - - if (!is_dynamic_input && output_layout.is_dynamic()) { - ov::OutputVector new_inputs; - for (size_t i = 0; i < inp_sz; i++) { - auto input = std::make_shared(impl_param.get_input_layout(i).data_type, impl_param.get_input_layout(i).get_shape()); - new_inputs.emplace_back(input); - } - - auto op = impl_param.typed_desc()->op; - auto new_op = op->clone_with_new_inputs(new_inputs); - new_op->validate_and_infer_types(); - auto new_outp_shape = new_op->get_output_shape(0); - output_layout.set_partial_shape(new_outp_shape); - } + typed_primitive_inst::update_output_shape(impl_param, output_layout); // if the output layout format was set to any, it means the layer output format will be the same as the first input if (output_layout.format == format::any) { @@ -62,6 +41,8 @@ class typed_primitive_inst : public typed_primitive_inst_b "custom_gpu_primitive_node!"); layout output_layout = impl_param.typed_desc()->output_layout; + typed_primitive_inst::update_output_shape(impl_param, output_layout); + // if the output layout format was set to any, it means the layer output format will be the same as the first // input if (output_layout.format == format::any) { @@ -76,6 +57,32 @@ class typed_primitive_inst : public typed_primitive_inst_b public: typed_primitive_inst(network& network, custom_gpu_primitive_node const& node); + +private: + static void update_output_shape(const kernel_impl_params& impl_param, layout& output_layout) { + bool is_dynamic_input = false; + const auto inp_sz = impl_param.get_input_layout_size(); + for (size_t i = 0; i < inp_sz; i++) { + if 
(impl_param.get_input_layout(i).is_dynamic()) { + is_dynamic_input = true; + break; + } + } + + if (!is_dynamic_input && output_layout.is_dynamic()) { + ov::OutputVector new_inputs; + for (size_t i = 0; i < inp_sz; i++) { + auto input = std::make_shared(impl_param.get_input_layout(i).data_type, impl_param.get_input_layout(i).get_shape()); + new_inputs.emplace_back(input); + } + + auto op = impl_param.typed_desc()->op; + auto new_op = op->clone_with_new_inputs(new_inputs); + new_op->validate_and_infer_types(); + auto new_outp_shape = new_op->get_output_shape(0); + output_layout.set_partial_shape(new_outp_shape); + } + } }; using custom_gpu_primitive_inst = typed_primitive_inst; From 977877a88b77b69810a1672f99a2ab137c10e826 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 19 Jun 2025 15:08:59 +0800 Subject: [PATCH 07/19] Move update gws,lws to primitive_imple create. Remove overrided get_fake_aligned_params. Signed-off-by: xiping.yan --- .../intel_gpu/graph/kernel_impl_params.hpp | 2 -- .../primitives/custom_gpu_primitive.hpp | 14 +++++----- .../src/graph/custom_gpu_primitive.cpp | 27 ------------------ .../src/graph/impls/ocl/custom_primitive.cpp | 28 ++++++++++++++----- .../graph/include/custom_gpu_primitive_inst.h | 2 -- 5 files changed, 28 insertions(+), 45 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp index e887b8ea09fcd0..bd1798f6beb4e4 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp @@ -47,8 +47,6 @@ struct kernel_impl_params final { size_t unique_id; bool _can_be_optimized = false; bool _runtime_skippable = false; - std::vector custom_op_dynamic_gws; - std::vector custom_op_dynamic_lws; std::vector input_layouts; std::vector output_layouts; std::vector input_offsets; diff --git 
a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp index 515aa2b30c8c58..97650edf0a2088 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/custom_gpu_primitive.hpp @@ -49,7 +49,7 @@ struct custom_gpu_primitive : public primitive_base { }; static void update_work_group_size(const ov::PartialShape& dims, - int iidx, + int calcWgDimInputIdx, const ov::PartialShape& inputDims, const std::vector& globalSizeRules, const std::vector& localSizeRules, @@ -61,8 +61,8 @@ struct custom_gpu_primitive : public primitive_base { lws.clear(); int batchDim = 0, featureDim = 0, yDim = 0, xDim = 0; - // if input index is greater than -1, take dimension from input - if (iidx >= 0) { + // if calcWgDimInputIdx is greater than -1, take dimension from input + if (calcWgDimInputIdx >= 0) { xDim = static_cast(GetDim(inputDims[inputDims.size() - 1])); yDim = dims.size() > 1 ? static_cast(GetDim(inputDims[inputDims.size() - 2])) : 0; featureDim = dims.size() > 2 ? 
static_cast(GetDim(inputDims[inputDims.size() - 3])) : 0; @@ -113,7 +113,7 @@ struct custom_gpu_primitive : public primitive_base { const std::vector& gws = {}, const std::vector& lws = {}, const std::shared_ptr& op = nullptr, - const int iidx = -1, + const int calcWgDimInputIdx = -1, const std::vector globalSizeRules = {}, const std::vector localSizeRules = {}) : primitive_base(id, inputs, 1, {optional_data_type()}, {output_layout.data_padding}), @@ -125,7 +125,7 @@ struct custom_gpu_primitive : public primitive_base { lws(lws), kernels_code(kernels_code), op(op), - iidx(iidx), + calcWgDimInputIdx(calcWgDimInputIdx), globalSizeRules(globalSizeRules), localSizeRules(localSizeRules) {} @@ -145,8 +145,8 @@ struct custom_gpu_primitive : public primitive_base { const primitive_id_arr kernels_code; /// @brief Original IR op const std::shared_ptr op; - /// @brief -1: mean calc gws via output, else calc gws via inputs(iidx) - const int iidx = -1; + /// @brief -1: mean calc gws via output, else calc gws via inputs + const int calcWgDimInputIdx = -1; /// @brief Custom provided rules for calc work sizes. 
const std::vector globalSizeRules; const std::vector localSizeRules; diff --git a/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp b/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp index 591d37781aee5e..4873c272f800fa 100644 --- a/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/custom_gpu_primitive.cpp @@ -29,33 +29,6 @@ std::string custom_gpu_primitive_inst::to_string(custom_gpu_primitive_node const return primitive_description.str(); } -kernel_impl_params custom_gpu_primitive_inst::get_fake_aligned_params(kernel_impl_params const& orig_impl_param) { - auto updated_param = std::move(orig_impl_param); - const auto& orig_input_layout = orig_impl_param.get_input_layout(); - const auto& orig_output_layout = orig_impl_param.get_output_layout(); - OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(), - "in/out layouts should be static for fake alignment!"); - - auto op = std::static_pointer_cast(updated_param.desc); - - std::vector gws, lws; - custom_gpu_primitive::update_work_group_size(orig_output_layout.get_partial_shape(), - op->iidx, - orig_output_layout.get_partial_shape(), - op->globalSizeRules, - op->localSizeRules, - gws, - lws); - - GPU_DEBUG_TRACE_DETAIL << "Apply fake alignment: gws(" << ov::Shape(op->gws).to_string() << " -> " << ov::Shape(gws).to_string() << "), lws(" - << ov::Shape(op->lws).to_string() << " -> " << ov::Shape(lws).to_string() << ")\n"; - - updated_param.custom_op_dynamic_gws = gws; - updated_param.custom_op_dynamic_lws = lws; - - return updated_param; -} - custom_gpu_primitive_inst::typed_primitive_inst(network& network, custom_gpu_primitive_node const& node) : parent(network, node) {} } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index e0e2eb808a5fc9..e47367665434f6 100644 --- 
a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -222,14 +222,16 @@ static void add_layout_to_jit(kernel_selector::jit_constants& mem_consts, const mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_OFFSET", std::to_string(offset))); } -static std::string get_jit_constant(const custom_gpu_primitive_node& outer, const kernel_impl_params& impl_param) { +static std::string get_jit_constant(const custom_gpu_primitive_node& outer, + const kernel_impl_params& impl_param, + const std::vector& gws, + const std::vector& lws) { kernel_selector::jit_constants mem_consts{ kernel_selector::MakeJitConstant("NUM_INPUTS", std::to_string(outer.get_dependencies().size()))}; - const auto primitive = outer.get_primitive().get(); mem_consts.AddConstants({ - kernel_selector::MakeJitConstant("GLOBAL_WORKSIZE", impl_param.custom_op_dynamic_gws.size() > 0 ? impl_param.custom_op_dynamic_gws : primitive->gws), - kernel_selector::MakeJitConstant("LOCAL_WORKSIZE", impl_param.custom_op_dynamic_lws.size() > 0 ? 
impl_param.custom_op_dynamic_lws : primitive->lws), + kernel_selector::MakeJitConstant("GLOBAL_WORKSIZE", gws), + kernel_selector::MakeJitConstant("LOCAL_WORKSIZE", lws), }); for (size_t i = 0; i < impl_param.input_layouts.size(); i++) { @@ -250,17 +252,29 @@ static std::string get_jit_constant(const custom_gpu_primitive_node& outer, cons static std::unique_ptr create(const custom_gpu_primitive_node& arg, const kernel_impl_params& impl_param) { const auto primitive = arg.get_primitive().get(); + const auto& orig_output_layout = impl_param.get_output_layout(); + OPENVINO_ASSERT(orig_output_layout.is_static(), "out layouts should be static for create primitive_impl!"); + + std::vector gws, lws; + custom_gpu_primitive::update_work_group_size(orig_output_layout.get_partial_shape(), + primitive->calcWgDimInputIdx, + orig_output_layout.get_partial_shape(), + primitive->globalSizeRules, + primitive->localSizeRules, + gws, + lws); + auto cl_kernel = std::make_shared(); cl_kernel->code.kernelString = std::make_shared(); cl_kernel->code.kernelString->entry_point = primitive->kernel_entry_point; cl_kernel->code.kernelString->options = primitive->build_options; - cl_kernel->code.kernelString->jit = get_jit_constant(arg, impl_param); + cl_kernel->code.kernelString->jit = get_jit_constant(arg, impl_param, gws, lws); for (const auto& s : primitive->kernels_code) { cl_kernel->code.kernelString->str += s + "\n"; } - cl_kernel->params.workGroups.global = impl_param.custom_op_dynamic_gws.size() > 0 ? 
impl_param.custom_op_dynamic_gws : primitive->gws; - cl_kernel->params.workGroups.local = primitive->lws; + cl_kernel->params.workGroups.global = gws; + cl_kernel->params.workGroups.local = lws; for (const auto& p : primitive->kernel_arguments) { cl_kernel->params.arguments.push_back(get_arg(p)); diff --git a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h index c260fe0c22702c..c03ad949e06c41 100644 --- a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h @@ -53,8 +53,6 @@ class typed_primitive_inst : public typed_primitive_inst_b static std::string to_string(custom_gpu_primitive_node const& node); - static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param); - public: typed_primitive_inst(network& network, custom_gpu_primitive_node const& node); From 026a4ee3bf35f1008ad99cc2a1a576def8ceb02a Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Tue, 24 Jun 2025 20:59:36 +0800 Subject: [PATCH 08/19] Fix gpu unit test fail issue. 
Signed-off-by: xiping.yan --- src/plugins/intel_gpu/tests/unit/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt index f07609ab042853..c2f208c371a201 100644 --- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt @@ -31,6 +31,7 @@ file(GLOB_RECURSE SOURCES_MAIN "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp" "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/usm_host_tensor.cpp" "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/common_utils.cpp" + "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/simple_math.cpp" ) if (NOT ENABLE_ONEDNN_FOR_GPU) From 1a1bf9ff4794f525d144895dd59ecf1ac3408943 Mon Sep 17 00:00:00 2001 From: xipingyan Date: Thu, 3 Jul 2025 15:47:38 +0800 Subject: [PATCH 09/19] Add test case for dynamic shape custom op. Signed-off-by: xipingyan --- .../intel_gpu/tests/functional/CMakeLists.txt | 1 + .../functional/custom_op/custom_op_dynamic.cl | 10 ++ .../custom_op/custom_op_dynamic.cpp | 126 ++++++++++++++++++ .../custom_op/custom_op_dynamic.xml | 13 ++ 4 files changed, 150 insertions(+) create mode 100644 src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl create mode 100644 src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp create mode 100644 src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.xml diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt index e4483000fa9434..43948e773c7d86 100644 --- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt @@ -13,6 +13,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") endif() list(APPEND DEFINES TEST_CUSTOM_OP_CONFIG_PATH="${CMAKE_CURRENT_SOURCE_DIR}/custom_op/custom_op.xml") +list(APPEND DEFINES 
TEST_CUSTOM_OP_DYNAMIC_CONFIG_PATH="${CMAKE_CURRENT_SOURCE_DIR}/custom_op/custom_op_dynamic.xml") ov_add_test_target( NAME diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl new file mode 100644 index 00000000000000..9f1cef1a9d07fd --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl @@ -0,0 +1,10 @@ +__kernel void custom_add_kernel( + __global const INPUT0_TYPE* inp0, + __global OUTPUT0_TYPE* outp) { + int b = get_global_id(0); + int f = get_global_id(1); + int y = get_global_id(2); + // shape: [-1, 1, 2] + int id = b * 1 * 2 + f * 2 + y; + outp[id] = inp0[id] * alpha + beta; +} diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp new file mode 100644 index 00000000000000..917593f942f477 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "openvino/core/any.hpp" +#include "openvino/core/graph_util.hpp" +#include "openvino/runtime/core.hpp" +#include "openvino/runtime/exec_model_info.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/op/constant.hpp" + +#include "base/ov_behavior_test_utils.hpp" + +using namespace ::testing; + +namespace ov { +namespace test { +namespace intel_gpu { + +class CustomAddOp : public ov::op::Op { +private: + float m_alpha; + float m_beta; + +public: + OPENVINO_OP("CustomAddOp", "gpu_opset"); + + CustomAddOp() = default; + + CustomAddOp(const ov::Output& input, float alpha, float beta) : Op({input}), m_alpha(alpha), m_beta(beta) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override { + set_output_size(1); + set_output_type(0, 
get_input_element_type(0), get_input_partial_shape(0)); + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + visitor.on_attribute("alpha", m_alpha); + visitor.on_attribute("beta", m_beta); + return true; + } + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override { + OPENVINO_ASSERT(new_args.size() == 1, "Incorrect number of new arguments"); + return std::make_shared(new_args[0], m_alpha, m_beta); + } + + bool has_evaluate() const override { + return true; + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override { + auto in = inputs[0]; + auto out = outputs[0]; + out.set_shape(in.get_shape()); + for (size_t i = 0; i < out.get_size(); i++) { + out.data()[i] = in.data()[i] * m_alpha + m_beta; + } + return true; + } +}; + +static std::shared_ptr get_simple_model_with_custom_add_op(float alpha, float beta, ov::PartialShape inp_shape) { + auto input = std::make_shared(ov::element::f32, inp_shape); + auto op = std::make_shared(input, alpha, beta); + auto result = std::make_shared(op); + + return std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}, "model_with_custom_op_dynamic"); +} + +TEST(CustomOpDynamic, CanReadValidCustomOpConfig) { + ov::Core core; + core.set_property(ov::test::utils::DEVICE_GPU, {{"CONFIG_FILE", TEST_CUSTOM_OP_DYNAMIC_CONFIG_PATH}}); +} + +TEST(CustomOpDynamic, Accuracy) { + ov::Core core; + float alpha = 1.0, beta = 0.1; + const size_t dim1 = 1, dim2 = 2; + auto model = get_simple_model_with_custom_add_op(alpha, beta, ov::PartialShape{-1, dim1, dim2}); + + ov::AnyMap config = {ov::hint::inference_precision(ov::element::f32), {"CONFIG_FILE", TEST_CUSTOM_OP_DYNAMIC_CONFIG_PATH}}; + auto compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, config); + + auto runtime_graph = compiled_model.get_runtime_model(); + auto ops = runtime_graph->get_ordered_ops(); + + bool found_custom_op = false; + for (auto op : ops) { + if 
(op->get_rt_info()[ov::exec_model_info::LAYER_TYPE].as() == "CustomGPUPrimitive") { + found_custom_op = true; + break; + } + } + ASSERT_TRUE(found_custom_op); + + auto inp_arr_1 = std::vector{0.2, 0.4}; + auto inp_arr_2 = std::vector{0.2, 0.4, 0.3, 0.5}; + auto inputs = std::vector{ov::Tensor({ov::element::f32}, ov::Shape{1, dim1, dim2}, inp_arr_1.data()), + ov::Tensor({ov::element::f32}, ov::Shape{2, dim1, dim2}, inp_arr_2.data())}; + auto ireq = compiled_model.create_infer_request(); + for (auto input : inputs) { + ireq.set_input_tensor(0, input); + ireq.infer(); + auto output = ireq.get_output_tensor(0); + std::vector actual(output.data(), output.data() + output.get_size()); + + ASSERT_EQ(output.get_element_type(), element::f32); + + float* inp_data = input.data(); + for (size_t i = 0; i < output.get_size(); i++) { + ASSERT_FLOAT_EQ(actual[i], inp_data[i] * alpha + beta); + } + } +} + +} // namespace intel_gpu +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.xml b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.xml new file mode 100644 index 00000000000000..1dcefcb9eedbf3 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + From aed6f422cdfa95af36e067e4fb450054281a05e1 Mon Sep 17 00:00:00 2001 From: xipingyan Date: Thu, 3 Jul 2025 17:05:07 +0800 Subject: [PATCH 10/19] Fix test case build fail issue. 
Signed-off-by: xipingyan --- .../intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp index 917593f942f477..d7e6504e36fe8f 100644 --- a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp @@ -14,7 +14,7 @@ #include "openvino/runtime/properties.hpp" #include "openvino/op/constant.hpp" -#include "base/ov_behavior_test_utils.hpp" +#include "shared_test_classes/base/ov_behavior_test_utils.hpp" using namespace ::testing; From 5a8363c7ec1a40818f83ec306529a13718380dd6 Mon Sep 17 00:00:00 2001 From: xipingyan Date: Fri, 4 Jul 2025 11:51:28 +0800 Subject: [PATCH 11/19] fix windows build issue. --- .../intel_gpu/src/graph/impls/ocl/custom_primitive.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index e47367665434f6..c4647c4557dd0e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -255,7 +255,7 @@ static std::unique_ptr create(const custom_gpu_primitive_node& a const auto& orig_output_layout = impl_param.get_output_layout(); OPENVINO_ASSERT(orig_output_layout.is_static(), "out layouts should be static for create primitive_impl!"); - std::vector gws, lws; + std::vector gws, lws; custom_gpu_primitive::update_work_group_size(orig_output_layout.get_partial_shape(), primitive->calcWgDimInputIdx, orig_output_layout.get_partial_shape(), @@ -268,7 +268,9 @@ static std::unique_ptr create(const custom_gpu_primitive_node& a cl_kernel->code.kernelString = std::make_shared(); cl_kernel->code.kernelString->entry_point = 
primitive->kernel_entry_point; cl_kernel->code.kernelString->options = primitive->build_options; - cl_kernel->code.kernelString->jit = get_jit_constant(arg, impl_param, gws, lws); + const std::vector const_gws = gws; + const std::vector const_lws = lws; + cl_kernel->code.kernelString->jit = get_jit_constant(arg, impl_param, const_gws, const_lws); for (const auto& s : primitive->kernels_code) { cl_kernel->code.kernelString->str += s + "\n"; } From 7216436da333f48816c11cc310fcf935a376ca5e Mon Sep 17 00:00:00 2001 From: xipingyan Date: Fri, 4 Jul 2025 15:00:21 +0800 Subject: [PATCH 12/19] fix windows build issue. --- .../intel_gpu/src/graph/impls/ocl/custom_primitive.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index c4647c4557dd0e..cecaf621cdf2e8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -268,8 +268,8 @@ static std::unique_ptr create(const custom_gpu_primitive_node& a cl_kernel->code.kernelString = std::make_shared(); cl_kernel->code.kernelString->entry_point = primitive->kernel_entry_point; cl_kernel->code.kernelString->options = primitive->build_options; - const std::vector const_gws = gws; - const std::vector const_lws = lws; + const std::vector const_gws = gws; + const std::vector const_lws = lws; cl_kernel->code.kernelString->jit = get_jit_constant(arg, impl_param, const_gws, const_lws); for (const auto& s : primitive->kernels_code) { cl_kernel->code.kernelString->str += s + "\n"; From afebea2be83d5277f2c97bbf79a4681476d53800 Mon Sep 17 00:00:00 2001 From: xipingyan Date: Sat, 5 Jul 2025 14:55:39 +0800 Subject: [PATCH 13/19] fix unit test fail: custom_gpu_primitive_f32.add_basic_in2x2x2x2 The root cause was unit test did not pass "global work size rule". 
Signed-off-by: xipingyan --- .../intel_gpu/src/graph/impls/ocl/custom_primitive.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index cecaf621cdf2e8..0f583a849fb754 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -264,6 +264,13 @@ static std::unique_ptr create(const custom_gpu_primitive_node& a gws, lws); + if (gws.empty()) { + gws = primitive->gws; + } + if (lws.empty()) { + lws = primitive->lws; + } + auto cl_kernel = std::make_shared(); cl_kernel->code.kernelString = std::make_shared(); cl_kernel->code.kernelString->entry_point = primitive->kernel_entry_point; From 9bee3e548c88f37b16ff61fef9ca1c57c270f9b7 Mon Sep 17 00:00:00 2001 From: xipingya Date: Sun, 27 Jul 2025 08:53:51 +0800 Subject: [PATCH 14/19] Register custom_gpu_primitive with a dynamic_shape kernel. Enable shape-agnostic execution.
Signed-off-by: xipingya --- src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp | 1 + src/plugins/intel_gpu/src/graph/registry/registry.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index 0f583a849fb754..7e91558e5f3c81 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -295,6 +295,7 @@ static std::unique_ptr create(const custom_gpu_primitive_node& a namespace detail { attach_custom_gpu_primitive_impl::attach_custom_gpu_primitive_impl() { + implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, create, {}); implementation_map::add(cldnn::impl_types::ocl, create, {}); } diff --git a/src/plugins/intel_gpu/src/graph/registry/registry.hpp b/src/plugins/intel_gpu/src/graph/registry/registry.hpp index 1a57a1ed1ac970..ed5b650e153bfc 100644 --- a/src/plugins/intel_gpu/src/graph/registry/registry.hpp +++ b/src/plugins/intel_gpu/src/graph/registry/registry.hpp @@ -170,7 +170,7 @@ REGISTER_DEFAULT_IMPLS(adaptive_pooling, OCL_S); REGISTER_DEFAULT_IMPLS(batch_to_space, OCL_S); REGISTER_DEFAULT_IMPLS(border, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(bucketize, OCL_S); -REGISTER_DEFAULT_IMPLS(custom_gpu_primitive, OCL_S); +REGISTER_DEFAULT_IMPLS(custom_gpu_primitive, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(data, COMMON_S, COMMON_D); REGISTER_DEFAULT_IMPLS(depth_to_space, OCL_S); REGISTER_DEFAULT_IMPLS(dft, OCL_S); From a3c26c1cce689914a18368064fa7f7fde291940d Mon Sep 17 00:00:00 2001 From: xipingya Date: Wed, 30 Jul 2025 04:03:19 +0800 Subject: [PATCH 15/19] 1: test kernel: get index based on macro 2: more dynamic dim for test. 3: revert dynamic shape kernel register. 4: Add comment about check if inference output shape. 
Signed-off-by: xipingya --- .../src/graph/impls/ocl/custom_primitive.cpp | 2 +- .../graph/include/custom_gpu_primitive_inst.h | 2 ++ .../intel_gpu/src/graph/registry/registry.hpp | 2 +- .../functional/custom_op/custom_op_dynamic.cl | 17 +++++++++++------ .../functional/custom_op/custom_op_dynamic.cpp | 12 ++++++------ 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index 7e91558e5f3c81..4189c92a73eb26 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -295,7 +295,7 @@ static std::unique_ptr create(const custom_gpu_primitive_node& a namespace detail { attach_custom_gpu_primitive_impl::attach_custom_gpu_primitive_impl() { - implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, create, {}); + // implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, create, {}); implementation_map::add(cldnn::impl_types::ocl, create, {}); } diff --git a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h index c03ad949e06c41..c50231eeca996f 100644 --- a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h @@ -67,6 +67,8 @@ class typed_primitive_inst : public typed_primitive_inst_b } } + // Execute the op's shape inference only for dynamic node when input shapes have already been calculated; otherwise, keep the original output layout + // unchanged (it will be either static for static model or have dynamic shape in case of dynamic flow) if (!is_dynamic_input && output_layout.is_dynamic()) { ov::OutputVector new_inputs; for (size_t i = 0; i < inp_sz; i++) { diff --git a/src/plugins/intel_gpu/src/graph/registry/registry.hpp 
b/src/plugins/intel_gpu/src/graph/registry/registry.hpp index ed5b650e153bfc..1a57a1ed1ac970 100644 --- a/src/plugins/intel_gpu/src/graph/registry/registry.hpp +++ b/src/plugins/intel_gpu/src/graph/registry/registry.hpp @@ -170,7 +170,7 @@ REGISTER_DEFAULT_IMPLS(adaptive_pooling, OCL_S); REGISTER_DEFAULT_IMPLS(batch_to_space, OCL_S); REGISTER_DEFAULT_IMPLS(border, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(bucketize, OCL_S); -REGISTER_DEFAULT_IMPLS(custom_gpu_primitive, OCL_S, OCL_D); +REGISTER_DEFAULT_IMPLS(custom_gpu_primitive, OCL_S); REGISTER_DEFAULT_IMPLS(data, COMMON_S, COMMON_D); REGISTER_DEFAULT_IMPLS(depth_to_space, OCL_S); REGISTER_DEFAULT_IMPLS(dft, OCL_S); diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl index 9f1cef1a9d07fd..67b27228053a16 100644 --- a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl @@ -1,10 +1,15 @@ __kernel void custom_add_kernel( __global const INPUT0_TYPE* inp0, __global OUTPUT0_TYPE* outp) { - int b = get_global_id(0); - int f = get_global_id(1); - int y = get_global_id(2); - // shape: [-1, 1, 2] - int id = b * 1 * 2 + f * 2 + y; - outp[id] = inp0[id] * alpha + beta; + const uint b = (uint)get_global_id(0); + const uint f = (uint)get_global_id(1); + const uint y = (uint)get_global_id(2); + #if INPUT0_DIMS_SIZE == 4 + const uint x = 0; + #endif + + const unsigned src_index = b*INPUT0_DIMS[1]*INPUT0_DIMS[2]*INPUT0_DIMS[3] + f*INPUT0_DIMS[2]*INPUT0_DIMS[3] + y*INPUT0_DIMS[3] + x; + const unsigned dst_index = src_index; + + outp[dst_index] = inp0[src_index] * alpha + beta; } diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp index d7e6504e36fe8f..36b19d6be1f39d 100644 --- 
a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp @@ -80,11 +80,11 @@ TEST(CustomOpDynamic, CanReadValidCustomOpConfig) { core.set_property(ov::test::utils::DEVICE_GPU, {{"CONFIG_FILE", TEST_CUSTOM_OP_DYNAMIC_CONFIG_PATH}}); } -TEST(CustomOpDynamic, Accuracy) { +TEST(smoke_CustomOpDynamic, Accuracy) { ov::Core core; float alpha = 1.0, beta = 0.1; - const size_t dim1 = 1, dim2 = 2; - auto model = get_simple_model_with_custom_add_op(alpha, beta, ov::PartialShape{-1, dim1, dim2}); + const size_t dim1 = 1; + auto model = get_simple_model_with_custom_add_op(alpha, beta, ov::PartialShape{-1, dim1, -1}); ov::AnyMap config = {ov::hint::inference_precision(ov::element::f32), {"CONFIG_FILE", TEST_CUSTOM_OP_DYNAMIC_CONFIG_PATH}}; auto compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, config); @@ -102,9 +102,9 @@ TEST(CustomOpDynamic, Accuracy) { ASSERT_TRUE(found_custom_op); auto inp_arr_1 = std::vector{0.2, 0.4}; - auto inp_arr_2 = std::vector{0.2, 0.4, 0.3, 0.5}; - auto inputs = std::vector{ov::Tensor({ov::element::f32}, ov::Shape{1, dim1, dim2}, inp_arr_1.data()), - ov::Tensor({ov::element::f32}, ov::Shape{2, dim1, dim2}, inp_arr_2.data())}; + auto inp_arr_2 = std::vector{0.2, 0.4, 0.3, 0.5, 0.7, 0.9}; + auto inputs = std::vector{ov::Tensor({ov::element::f32}, ov::Shape{1, dim1, 2}, inp_arr_1.data()), + ov::Tensor({ov::element::f32}, ov::Shape{2, dim1, 3}, inp_arr_2.data())}; auto ireq = compiled_model.create_infer_request(); for (auto input : inputs) { ireq.set_input_tensor(0, input); From 0f64fd8d2a83a2559a6ffa12bef308f2fd364ca6 Mon Sep 17 00:00:00 2001 From: xipingya Date: Wed, 30 Jul 2025 05:22:47 +0800 Subject: [PATCH 16/19] Override get_shape_infer_dependencies Signed-off-by: xipingya --- .../intel_gpu/src/graph/impls/ocl/custom_primitive.cpp | 1 - .../src/graph/include/custom_gpu_primitive_inst.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 
1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index 4189c92a73eb26..0f583a849fb754 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -295,7 +295,6 @@ static std::unique_ptr create(const custom_gpu_primitive_node& a namespace detail { attach_custom_gpu_primitive_impl::attach_custom_gpu_primitive_impl() { - // implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, create, {}); implementation_map::add(cldnn::impl_types::ocl, create, {}); } diff --git a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h index c50231eeca996f..333595d8851ee6 100644 --- a/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/custom_gpu_primitive_inst.h @@ -11,6 +11,16 @@ namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + typed_program_node(std::shared_ptr prim, program& prog) : parent(prim, prog) {} + program_node& input() const { return get_dependency(0); } + + std::vector get_shape_infer_dependencies() const override { return {}; } +}; using custom_gpu_primitive_node = typed_program_node; template <> From 6e3dd7f3d95ef96cbd7f7e2e8b061ebcc4ee4b9c Mon Sep 17 00:00:00 2001 From: xipingya Date: Fri, 1 Aug 2025 10:50:33 +0800 Subject: [PATCH 17/19] Fix ci issue. Use "ov::test::utils::createFile() and ov::test::utils::removeFile()" to manage config and cl files. 
Signed-off-by: xipingya --- .../intel_gpu/tests/functional/CMakeLists.txt | 1 - .../functional/custom_op/custom_op_dynamic.cl | 15 ----- .../custom_op/custom_op_dynamic.cpp | 55 ++++++++++++++++++- .../custom_op/custom_op_dynamic.xml | 13 ----- 4 files changed, 53 insertions(+), 31 deletions(-) delete mode 100644 src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl delete mode 100644 src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.xml diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt index 9ecfa4b1e06e38..c89083a026aed6 100644 --- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt @@ -13,7 +13,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") endif() list(APPEND DEFINES TEST_CUSTOM_OP_CONFIG_PATH="${CMAKE_CURRENT_SOURCE_DIR}/custom_op/custom_op.xml") -list(APPEND DEFINES TEST_CUSTOM_OP_DYNAMIC_CONFIG_PATH="${CMAKE_CURRENT_SOURCE_DIR}/custom_op/custom_op_dynamic.xml") ov_add_test_target( NAME diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl deleted file mode 100644 index 67b27228053a16..00000000000000 --- a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cl +++ /dev/null @@ -1,15 +0,0 @@ -__kernel void custom_add_kernel( - __global const INPUT0_TYPE* inp0, - __global OUTPUT0_TYPE* outp) { - const uint b = (uint)get_global_id(0); - const uint f = (uint)get_global_id(1); - const uint y = (uint)get_global_id(2); - #if INPUT0_DIMS_SIZE == 4 - const uint x = 0; - #endif - - const unsigned src_index = b*INPUT0_DIMS[1]*INPUT0_DIMS[2]*INPUT0_DIMS[3] + f*INPUT0_DIMS[2]*INPUT0_DIMS[3] + y*INPUT0_DIMS[3] + x; - const unsigned dst_index = src_index; - - outp[dst_index] = inp0[src_index] * alpha + beta; -} diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp 
b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp index 36b19d6be1f39d..98019037bda75e 100644 --- a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp @@ -75,9 +75,57 @@ static std::shared_ptr get_simple_model_with_custom_add_op(float alph return std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}, "model_with_custom_op_dynamic"); } +static std::pair generate_config_files() { + std::string config_cl = ov::test::utils::generateTestFilePrefix() + "_custom_op_dynamic.cl"; + std::string config_xml = ov::test::utils::generateTestFilePrefix() + "_custom_op_dynamic.xml"; + + std::string content_cl = R"( + __kernel void custom_add_kernel( + __global const INPUT0_TYPE* inp0, + __global OUTPUT0_TYPE* outp) { + const uint b = (uint)get_global_id(0); + const uint f = (uint)get_global_id(1); + const uint y = (uint)get_global_id(2); + #if INPUT0_DIMS_SIZE == 4 + const uint x = 0; + #endif + + const unsigned src_index = b*INPUT0_DIMS[1]*INPUT0_DIMS[2]*INPUT0_DIMS[3] + f*INPUT0_DIMS[2]*INPUT0_DIMS[3] + y*INPUT0_DIMS[3] + x; + const unsigned dst_index = src_index; + + outp[dst_index] = inp0[src_index] * alpha + beta; + })"; + + std::string content_xml = R"( + + + + + + + + + + + + + )"; + + ov::test::utils::createFile(config_cl, content_cl); + ov::test::utils::createFile(config_xml, content_xml); + return {config_xml, config_cl}; +} + +static void remove_configs(std::pair config_files) { + ov::test::utils::removeFile(config_files.first.c_str()); + ov::test::utils::removeFile(config_files.second.c_str()); +} + TEST(CustomOpDynamic, CanReadValidCustomOpConfig) { ov::Core core; - core.set_property(ov::test::utils::DEVICE_GPU, {{"CONFIG_FILE", TEST_CUSTOM_OP_DYNAMIC_CONFIG_PATH}}); + auto config_files = generate_config_files(); + core.set_property(ov::test::utils::DEVICE_GPU, {{"CONFIG_FILE", config_files.first}}); + remove_configs(config_files); } 
TEST(smoke_CustomOpDynamic, Accuracy) { @@ -86,7 +134,8 @@ TEST(smoke_CustomOpDynamic, Accuracy) { const size_t dim1 = 1; auto model = get_simple_model_with_custom_add_op(alpha, beta, ov::PartialShape{-1, dim1, -1}); - ov::AnyMap config = {ov::hint::inference_precision(ov::element::f32), {"CONFIG_FILE", TEST_CUSTOM_OP_DYNAMIC_CONFIG_PATH}}; + auto config_files = generate_config_files(); + ov::AnyMap config = {ov::hint::inference_precision(ov::element::f32), {"CONFIG_FILE", config_files.first}}; auto compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, config); auto runtime_graph = compiled_model.get_runtime_model(); @@ -119,6 +168,8 @@ TEST(smoke_CustomOpDynamic, Accuracy) { ASSERT_FLOAT_EQ(actual[i], inp_data[i] * alpha + beta); } } + + remove_configs(config_files); } } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.xml b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.xml deleted file mode 100644 index 1dcefcb9eedbf3..00000000000000 --- a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - From c1473b8333cc05254591e16dcbbc6e7e61b3da99 Mon Sep 17 00:00:00 2001 From: xipingya Date: Fri, 1 Aug 2025 16:16:41 +0800 Subject: [PATCH 18/19] move generateTestFilePrefix to setup and teardown. 
Signed-off-by: xipingya --- .../custom_op/custom_op_dynamic.cpp | 224 ++++++++++-------- 1 file changed, 124 insertions(+), 100 deletions(-) diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp index 98019037bda75e..7c6984d118f5a5 100644 --- a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp @@ -3,12 +3,8 @@ // #include -#include #include -#include -#include "openvino/core/any.hpp" -#include "openvino/core/graph_util.hpp" #include "openvino/runtime/core.hpp" #include "openvino/runtime/exec_model_info.hpp" #include "openvino/runtime/properties.hpp" @@ -67,111 +63,139 @@ class CustomAddOp : public ov::op::Op { } }; -static std::shared_ptr get_simple_model_with_custom_add_op(float alpha, float beta, ov::PartialShape inp_shape) { - auto input = std::make_shared(ov::element::f32, inp_shape); - auto op = std::make_shared(input, alpha, beta); - auto result = std::make_shared(op); +using CustomOpDynamicTestParams = std::tuple, // input shape + std::vector>>; // input data +class CustomOpDynamic : public ov::test::TestsCommon, public testing::WithParamInterface { + void SetUp() override { + generate_config_files(); + }; - return std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}, "model_with_custom_op_dynamic"); -} - -static std::pair generate_config_files() { - std::string config_cl = ov::test::utils::generateTestFilePrefix() + "_custom_op_dynamic.cl"; - std::string config_xml = ov::test::utils::generateTestFilePrefix() + "_custom_op_dynamic.xml"; - - std::string content_cl = R"( - __kernel void custom_add_kernel( - __global const INPUT0_TYPE* inp0, - __global OUTPUT0_TYPE* outp) { - const uint b = (uint)get_global_id(0); - const uint f = (uint)get_global_id(1); - const uint y = (uint)get_global_id(2); - #if INPUT0_DIMS_SIZE == 4 - const uint x = 0; - 
#endif - - const unsigned src_index = b*INPUT0_DIMS[1]*INPUT0_DIMS[2]*INPUT0_DIMS[3] + f*INPUT0_DIMS[2]*INPUT0_DIMS[3] + y*INPUT0_DIMS[3] + x; - const unsigned dst_index = src_index; - - outp[dst_index] = inp0[src_index] * alpha + beta; - })"; - - std::string content_xml = R"( - - - - - - - - - - - - - )"; - - ov::test::utils::createFile(config_cl, content_cl); - ov::test::utils::createFile(config_xml, content_xml); - return {config_xml, config_cl}; -} - -static void remove_configs(std::pair config_files) { - ov::test::utils::removeFile(config_files.first.c_str()); - ov::test::utils::removeFile(config_files.second.c_str()); -} - -TEST(CustomOpDynamic, CanReadValidCustomOpConfig) { - ov::Core core; - auto config_files = generate_config_files(); - core.set_property(ov::test::utils::DEVICE_GPU, {{"CONFIG_FILE", config_files.first}}); - remove_configs(config_files); -} - -TEST(smoke_CustomOpDynamic, Accuracy) { - ov::Core core; - float alpha = 1.0, beta = 0.1; - const size_t dim1 = 1; - auto model = get_simple_model_with_custom_add_op(alpha, beta, ov::PartialShape{-1, dim1, -1}); - - auto config_files = generate_config_files(); - ov::AnyMap config = {ov::hint::inference_precision(ov::element::f32), {"CONFIG_FILE", config_files.first}}; - auto compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, config); - - auto runtime_graph = compiled_model.get_runtime_model(); - auto ops = runtime_graph->get_ordered_ops(); + void TearDown() override { + ov::test::utils::removeFile(config_cl); + ov::test::utils::removeFile(config_xml); + } - bool found_custom_op = false; - for (auto op : ops) { - if (op->get_rt_info()[ov::exec_model_info::LAYER_TYPE].as() == "CustomGPUPrimitive") { - found_custom_op = true; - break; +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + std::vector input_shapes; + std::vector> input_datas; + std::tie(input_shapes, input_datas) = obj.param; + + std::ostringstream result; + result << "input_shape="; + 
for (auto shape : input_shapes) { + result << shape; } + return result.str(); } - ASSERT_TRUE(found_custom_op); - - auto inp_arr_1 = std::vector{0.2, 0.4}; - auto inp_arr_2 = std::vector{0.2, 0.4, 0.3, 0.5, 0.7, 0.9}; - auto inputs = std::vector{ov::Tensor({ov::element::f32}, ov::Shape{1, dim1, 2}, inp_arr_1.data()), - ov::Tensor({ov::element::f32}, ov::Shape{2, dim1, 3}, inp_arr_2.data())}; - auto ireq = compiled_model.create_infer_request(); - for (auto input : inputs) { - ireq.set_input_tensor(0, input); - ireq.infer(); - auto output = ireq.get_output_tensor(0); - std::vector actual(output.data(), output.data() + output.get_size()); - - ASSERT_EQ(output.get_element_type(), element::f32); - - float* inp_data = input.data(); - for (size_t i = 0; i < output.get_size(); i++) { - ASSERT_FLOAT_EQ(actual[i], inp_data[i] * alpha + beta); + + static const size_t dim1 = 1; + void run() { + std::vector input_shapes; + std::vector> input_datas; + std::tie(input_shapes, input_datas) = GetParam(); + ASSERT_TRUE(input_shapes.size() == input_datas.size()); + + ov::Core core; + float alpha = 1.0, beta = 0.1; + auto model = generate_model_with_custom_add_op(alpha, beta, ov::PartialShape{-1, dim1, -1}); + + ov::AnyMap config = {ov::hint::inference_precision(ov::element::f32), {"CONFIG_FILE", config_xml}}; + auto compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, config); + + auto runtime_graph = compiled_model.get_runtime_model(); + auto ops = runtime_graph->get_ordered_ops(); + + bool found_custom_op = false; + for (auto op : ops) { + if (op->get_rt_info()[ov::exec_model_info::LAYER_TYPE].as() == "CustomGPUPrimitive") { + found_custom_op = true; + break; + } + } + ASSERT_TRUE(found_custom_op); + + auto ireq = compiled_model.create_infer_request(); + for (size_t i = 0; i < input_datas.size(); i++) { + auto input = ov::Tensor({ov::element::f32}, input_shapes[i], input_datas[i].data()); + ireq.set_input_tensor(0, input); + ireq.infer(); + auto output = 
ireq.get_output_tensor(0); + std::vector actual(output.data(), output.data() + output.get_size()); + + ASSERT_EQ(output.get_element_type(), element::f32); + + float* inp_data = input.data(); + for (size_t i = 0; i < output.get_size(); i++) { + ASSERT_FLOAT_EQ(actual[i], inp_data[i] * alpha + beta); + } } } - remove_configs(config_files); +private: + std::string config_cl; + std::string config_xml; + + void generate_config_files() { + config_cl = ov::test::utils::generateTestFilePrefix() + "_custom_op_dynamic.cl"; + config_xml = ov::test::utils::generateTestFilePrefix() + "_custom_op_dynamic.xml"; + + std::string content_cl = R"( + __kernel void custom_add_kernel( + __global const INPUT0_TYPE* inp0, + __global OUTPUT0_TYPE* outp) { + const uint b = (uint)get_global_id(0); + const uint f = (uint)get_global_id(1); + const uint y = (uint)get_global_id(2); + #if INPUT0_DIMS_SIZE == 4 + const uint x = 0; + #endif + + const unsigned src_index = b*INPUT0_DIMS[1]*INPUT0_DIMS[2]*INPUT0_DIMS[3] + f*INPUT0_DIMS[2]*INPUT0_DIMS[3] + y*INPUT0_DIMS[3] + x; + const unsigned dst_index = src_index; + + outp[dst_index] = inp0[src_index] * alpha + beta; + })"; + + std::string content_xml = R"( + + + + + + + + + + + + + )"; + + ov::test::utils::createFile(config_cl, content_cl); + ov::test::utils::createFile(config_xml, content_xml); + } + + std::shared_ptr generate_model_with_custom_add_op(float alpha, float beta, ov::PartialShape inp_shape) { + auto input = std::make_shared(ov::element::f32, inp_shape); + auto op = std::make_shared(input, alpha, beta); + auto result = std::make_shared(op); + return std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}, "model_with_custom_op_dynamic"); + } +}; + +TEST_P(CustomOpDynamic, Accuracy) { + run(); } +const std::vector input_shapes{{1, CustomOpDynamic::dim1, 2}, {2, CustomOpDynamic::dim1, 3}}; +const std::vector> input_datas{{0.2, 0.4}, {0.2, 0.4, 0.3, 0.5, 0.7, 0.9}}; + +INSTANTIATE_TEST_SUITE_P(smoke_GPU_Accuracy, 
CustomOpDynamic, + ::testing::Combine(::testing::Values(input_shapes), + ::testing::Values(input_datas)), + CustomOpDynamic::getTestCaseName); + } // namespace intel_gpu } // namespace test } // namespace ov From 422c3abdfe940ec1147691e7d7e90f0dda6ff4fa Mon Sep 17 00:00:00 2001 From: xipingya Date: Sat, 2 Aug 2025 11:56:22 +0800 Subject: [PATCH 19/19] Add test: custom op static model accuracy test. Signed-off-by: xipingya --- .../custom_op/custom_op_dynamic.cpp | 75 ++++++++++++++++--- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp index 7c6984d118f5a5..7509d0fb5d5e70 100644 --- a/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp +++ b/src/plugins/intel_gpu/tests/functional/custom_op/custom_op_dynamic.cpp @@ -5,11 +5,10 @@ #include #include +#include "openvino/op/constant.hpp" #include "openvino/runtime/core.hpp" #include "openvino/runtime/exec_model_info.hpp" #include "openvino/runtime/properties.hpp" -#include "openvino/op/constant.hpp" - #include "shared_test_classes/base/ov_behavior_test_utils.hpp" using namespace ::testing; @@ -132,7 +131,7 @@ class CustomOpDynamic : public ov::test::TestsCommon, public testing::WithParamI } } -private: +protected: std::string config_cl; std::string config_xml; @@ -184,18 +183,74 @@ class CustomOpDynamic : public ov::test::TestsCommon, public testing::WithParamI } }; +class CustomOpStatic : public CustomOpDynamic { +public: + void run() { + std::vector input_shapes; + std::vector> input_datas; + std::tie(input_shapes, input_datas) = GetParam(); + ASSERT_EQ(input_shapes.size(), input_datas.size()); + ASSERT_EQ(input_shapes.size(), 1u); + + ov::Core core; + float alpha = 1.0, beta = 0.1; + auto model = generate_model_with_custom_add_op(alpha, beta, ov::PartialShape(input_shapes[0])); + + ov::AnyMap config = 
{ov::hint::inference_precision(ov::element::f32), {"CONFIG_FILE", config_xml}}; + auto compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, config); + + auto runtime_graph = compiled_model.get_runtime_model(); + auto ops = runtime_graph->get_ordered_ops(); + + bool found_custom_op = false; + for (auto op : ops) { + if (op->get_rt_info()[ov::exec_model_info::LAYER_TYPE].as() == "CustomGPUPrimitive") { + found_custom_op = true; + break; + } + } + ASSERT_TRUE(found_custom_op); + + auto ireq = compiled_model.create_infer_request(); + auto input = ov::Tensor({ov::element::f32}, input_shapes[0], input_datas[0].data()); + ireq.set_input_tensor(0, input); + ireq.infer(); + auto output = ireq.get_output_tensor(0); + std::vector actual(output.data(), output.data() + output.get_size()); + + ASSERT_EQ(output.get_element_type(), element::f32); + + float* inp_data = input.data(); + for (size_t i = 0; i < output.get_size(); i++) { + ASSERT_FLOAT_EQ(actual[i], inp_data[i] * alpha + beta); + } + } +}; + TEST_P(CustomOpDynamic, Accuracy) { run(); } +TEST_P(CustomOpStatic, Accuracy) { + run(); +} + const std::vector input_shapes{{1, CustomOpDynamic::dim1, 2}, {2, CustomOpDynamic::dim1, 3}}; const std::vector> input_datas{{0.2, 0.4}, {0.2, 0.4, 0.3, 0.5, 0.7, 0.9}}; -INSTANTIATE_TEST_SUITE_P(smoke_GPU_Accuracy, CustomOpDynamic, - ::testing::Combine(::testing::Values(input_shapes), - ::testing::Values(input_datas)), - CustomOpDynamic::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_GPU_Accuracy, + CustomOpDynamic, + ::testing::Combine(::testing::Values(input_shapes), ::testing::Values(input_datas)), + CustomOpDynamic::getTestCaseName); + +const std::vector input_static_shapes{{2, 2, 3}}; +const std::vector> input_static_datas{{0.2, 0.4, 0.3, 0.5, 0.7, 0.9, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6}}; + +INSTANTIATE_TEST_SUITE_P(smoke_GPU_Accuracy, + CustomOpStatic, + ::testing::Combine(::testing::Values(input_static_shapes), ::testing::Values(input_static_datas)), + 
CustomOpStatic::getTestCaseName); -} // namespace intel_gpu -} // namespace test -} // namespace ov +} // namespace intel_gpu +} // namespace test +} // namespace ov