Skip to content

Commit 429c726

Browse files
[GPU] Implement NMS-9 operation (#11890) (#12760)
* Fix GPU NonMaxSuppression implementation
* Introduce Nms9 single layer tests
* Adapt internal NMS and GPU implementation for NMS9 implementation
* Adapt CPU implementation in GPU for NMS9
* Add blocked layouts support to NMS
* Add unit tests for blocked formats for NMS
* Fix boxes groups size for the small shapes
* Use ocl implementation for blocked layout input
* Fix templates typedefs to pass win build
* Fix second output to set data in correct format

Co-authored-by: Tetiana Gubanova <tgubanova@lohika.com>
1 parent 7123433 commit 429c726

File tree

16 files changed

+613
-409
lines changed

16 files changed

+613
-409
lines changed

src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
#include <vector>
1212
#include <queue>
1313
#include <algorithm>
14-
#include <utility>
1514
#include <tuple>
1615

1716
namespace cldnn {
@@ -45,12 +44,15 @@ std::vector<result_indices> run_nms(
4544
return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx));
4645
};
4746
float scale = 0.0f;
47+
bool soft_nms = false;
4848
if (soft_nms_sigma > 0.0f) {
4949
scale = -0.5f / soft_nms_sigma;
50+
soft_nms = true;
5051
}
52+
5153
auto coeff = [&](float iou) {
5254
const float weight = std::exp(scale * iou * iou);
53-
return iou <= iou_threshold ? weight : 0.0f;
55+
return (iou <= iou_threshold || soft_nms) ? weight : 0.0f;
5456
};
5557
std::vector<result_indices> result;
5658

@@ -75,7 +77,7 @@ std::vector<result_indices> run_nms(
7577
float iou_boxes = iou(boxes[bi][currBox.idx], boxes[bi][fb[idx].box_index]);
7678

7779
currBox.score *= coeff(iou_boxes);
78-
if (iou_boxes >= iou_threshold) {
80+
if (iou_boxes >= iou_threshold && !soft_nms) {
7981
box_is_selected = false;
8082
break;
8183
}
@@ -98,13 +100,12 @@ std::vector<result_indices> run_nms(
98100
}
99101

100102
if (sort_result_descending) {
101-
std::sort(result.begin(), result.end(),
102-
[](const result_indices& l, const result_indices& r) {
103-
return (l.score > r.score) ||
104-
(l.score == r.score && l.batch_index < r.batch_index) ||
105-
(l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) ||
106-
(l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index);
107-
});
103+
std::sort(result.begin(), result.end(), [](const result_indices& l, const result_indices& r) {
104+
return (l.score > r.score) || (l.score == r.score && l.batch_index < r.batch_index) ||
105+
(l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) ||
106+
(l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index &&
107+
l.box_index < r.box_index);
108+
});
108109
}
109110
return result;
110111
}
@@ -125,12 +126,11 @@ vector2D<bounding_box> load_boxes_impl(stream& stream, memory::ptr mem, bool cen
125126
for (int bxi = 0; bxi < boxes_num; ++bxi) {
126127
int offset = bi * boxes_num * 4 + bxi * 4;
127128
if (center_point) {
128-
result[bi].emplace_back(
129-
static_cast<float>(ptr[offset + 0]),
130-
static_cast<float>(ptr[offset + 1]),
131-
static_cast<float>(ptr[offset + 2]),
132-
static_cast<float>(ptr[offset + 3]),
133-
bounding_box::center_point_construct_tag());
129+
result[bi].emplace_back(static_cast<float>(ptr[offset + 0]),
130+
static_cast<float>(ptr[offset + 1]),
131+
static_cast<float>(ptr[offset + 2]),
132+
static_cast<float>(ptr[offset + 3]),
133+
bounding_box::center_point_construct_tag());
134134
} else {
135135
result[bi].emplace_back(
136136
static_cast<float>(ptr[offset + 1]),
@@ -357,7 +357,13 @@ void run(non_max_suppression_inst& instance) {
357357
soft_nms_sigma = load_scalar<float>(stream, instance.soft_nms_sigma_mem());
358358
}
359359

360-
auto result = run_nms(boxes, scores, num_select_per_class, score_threshold, iou_threshold, soft_nms_sigma, prim->sort_result_descending);
360+
auto result = run_nms(boxes,
361+
scores,
362+
num_select_per_class,
363+
score_threshold,
364+
iou_threshold,
365+
soft_nms_sigma,
366+
prim->sort_result_descending);
361367

362368
if (instance.has_third_output()) {
363369
store_third_output(stream, instance.third_output_mem(), result);

src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,12 @@
33
//
44

55
#include "data_inst.h"
6+
#include "kernel_selector_helper.h"
7+
#include "non_max_suppression/non_max_suppression_kernel_ref.h"
8+
#include "non_max_suppression/non_max_suppression_kernel_selector.h"
69
#include "non_max_suppression_inst.h"
710
#include "primitive_base.hpp"
811
#include "impls/implementation_map.hpp"
9-
#include "kernel_selector_helper.h"
10-
#include "non_max_suppression/non_max_suppression_kernel_selector.h"
11-
#include "non_max_suppression/non_max_suppression_kernel_ref.h"
12-
1312

1413
namespace cldnn {
1514
namespace ocl {
@@ -44,7 +43,7 @@ struct non_max_suppression_impl : typed_primitive_impl_ocl<non_max_suppression>
4443
args.inputs.push_back(instance.soft_nms_sigma_mem());
4544
}
4645

47-
args.outputs = { instance.output_memory_ptr() };
46+
args.outputs = {instance.output_memory_ptr()};
4847
if (instance.has_second_output())
4948
args.inputs.push_back(instance.second_output_mem());
5049
if (instance.has_third_output())
@@ -107,19 +106,22 @@ struct non_max_suppression_impl : typed_primitive_impl_ocl<non_max_suppression>
107106
}
108107

109108
if (arg.has_second_output()) {
110-
params.inputs.push_back(convert_data_tensor(arg.second_output_node().get_output_layout()));
109+
layout second_output_layout = arg.second_output_node().get_output_layout();
110+
second_output_layout.format = arg.input_scores().get_output_layout().format;
111+
params.inputs.push_back(convert_data_tensor(second_output_layout));
111112
params.has_second_output = true;
112113
}
113114

114115
if (arg.has_third_output()) {
115-
params.inputs.push_back(convert_data_tensor(arg.third_output_node().get_output_layout()));
116+
layout third_output_layout = arg.third_output_node().get_output_layout();
117+
third_output_layout.format = arg.input_scores().get_output_layout().format;
118+
params.inputs.push_back(convert_data_tensor(third_output_layout));
116119
params.has_third_output = true;
117120
}
118121

119122
params.sort_result_descending = primitive->sort_result_descending;
120-
params.box_encoding = primitive->center_point_box ?
121-
kernel_selector::BoxEncodingType::BOX_ENCODING_CENTER : kernel_selector::BoxEncodingType::BOX_ENCODING_CORNER;
122-
123+
params.box_encoding = primitive->center_point_box ? kernel_selector::BoxEncodingType::BOX_ENCODING_CENTER
124+
: kernel_selector::BoxEncodingType::BOX_ENCODING_CORNER;
123125
auto& kernel_selector = kernel_selector::non_max_suppression_kernel_selector::Instance();
124126
auto best_kernels = kernel_selector.GetBestKernels(params, optional_params);
125127

@@ -171,11 +173,25 @@ struct non_max_suppression_impl : typed_primitive_impl_ocl<non_max_suppression>
171173
namespace detail {
172174

173175
attach_non_max_suppression_impl::attach_non_max_suppression_impl() {
174-
implementation_map<non_max_suppression>::add(impl_types::ocl, non_max_suppression_impl::create, {
175-
std::make_tuple(data_types::i32, format::bfyx),
176-
std::make_tuple(data_types::f16, format::bfyx),
177-
std::make_tuple(data_types::f32, format::bfyx),
178-
});
176+
implementation_map<non_max_suppression>::add(impl_types::ocl,
177+
non_max_suppression_impl::create,
178+
{
179+
std::make_tuple(data_types::i32, format::bfyx),
180+
181+
std::make_tuple(data_types::f16, format::bfyx),
182+
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
183+
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
184+
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv16),
185+
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
186+
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
187+
188+
std::make_tuple(data_types::f32, format::bfyx),
189+
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
190+
std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
191+
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
192+
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16),
193+
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
194+
});
179195
}
180196

181197
} // namespace detail

src/plugins/intel_gpu/src/graph/layout_optimizer.cpp

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1439,13 +1439,25 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
14391439
else
14401440
preferred_impl = impl_types::cpu;
14411441
} else if (node.is_type<non_max_suppression>()) {
1442-
auto& nms_node = node.as<non_max_suppression>();
1443-
auto scoresTensor = convert_data_tensor(nms_node.input_scores().get_output_layout());
1444-
const size_t kBatchNum = scoresTensor.Batch().v;
1445-
const size_t kClassNum = scoresTensor.Feature().v;
1446-
const size_t kNStreams = static_cast<size_t>(node.get_program().get_engine().configuration().throughput_streams);
1447-
const size_t kKeyValue = kBatchNum * std::min(kClassNum, static_cast<size_t>(8)) * kNStreams;
1448-
preferred_impl = (kKeyValue > 64) ? impl_types::ocl : impl_types::cpu;
1442+
const std::set<format> blocked_formats = {
1443+
format::b_fs_yx_fsv16,
1444+
format::b_fs_yx_fsv32,
1445+
format::bs_fs_yx_bsv16_fsv16,
1446+
format::bs_fs_yx_bsv32_fsv16,
1447+
format::bs_fs_yx_bsv32_fsv32,
1448+
};
1449+
if (blocked_formats.find(node.get_dependency(0).get_output_layout().format) != blocked_formats.end()) {
1450+
preferred_impl = impl_types::ocl;
1451+
} else {
1452+
auto& nms_node = node.as<non_max_suppression>();
1453+
auto scoresTensor = convert_data_tensor(nms_node.input_scores().get_output_layout());
1454+
const size_t kBatchNum = scoresTensor.Batch().v;
1455+
const size_t kClassNum = scoresTensor.Feature().v;
1456+
const size_t kNStreams =
1457+
static_cast<size_t>(node.get_program().get_engine().configuration().throughput_streams);
1458+
const size_t kKeyValue = kBatchNum * std::min(kClassNum, static_cast<size_t>(8)) * kNStreams;
1459+
preferred_impl = (kKeyValue > 64) ? impl_types::ocl : impl_types::cpu;
1460+
}
14491461
} else if (node.is_type<reorder>()) {
14501462
if (!_optimization_attributes.use_onednn_impls)
14511463
return impl_types::ocl;

src/plugins/intel_gpu/src/graph/non_max_suppression.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@ layout non_max_suppression_inst::calc_output_layout(non_max_suppression_node con
2020
auto output_type = desc->output_data_type ? *desc->output_data_type : data_types::i32;
2121

2222
auto output_size = tensor(batch(desc->selected_indices_num), feature(3));
23-
24-
return layout(output_type, format::bfyx, output_size);
23+
return layout(output_type, node.input().get_output_layout().format, output_size);
2524
}
2625

2726
std::string non_max_suppression_inst::to_string(non_max_suppression_node const& node) {

src/plugins/intel_gpu/src/graph/program.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,7 +1398,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
13981398
prim.type() != cldnn::normalize::type_id() &&
13991399
prim.type() != cldnn::mvn::type_id() &&
14001400
prim.type() != cldnn::gather::type_id() &&
1401-
prim.type() != cldnn::scatter_nd_update::type_id()) {
1401+
prim.type() != cldnn::scatter_nd_update::type_id() &&
1402+
prim.type() != cldnn::non_max_suppression::type_id()) {
14021403
can_use_fsv16 = false;
14031404
}
14041405

@@ -1425,11 +1426,11 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
14251426
prim.type() != cldnn::fully_connected::type_id() &&
14261427
prim.type() != cldnn::generic_layer::type_id() &&
14271428
prim.type() != cldnn::scatter_nd_update::type_id() &&
1428-
prim.type() != cldnn::quantize::type_id())
1429+
prim.type() != cldnn::quantize::type_id() &&
1430+
prim.type() != cldnn::non_max_suppression::type_id())
14291431
can_use_bs_fs_yx_bsv16_fsv16 = false;
14301432
}
14311433

1432-
14331434
size_t total_conv_layers = lo.get_total_conv_count();
14341435
// Due to the fact that single winograd convolution is faster than b_fs_yx_fsv16 and
14351436
// using them together leads to redundant reorders, whole topology switch

src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ inline std::string GetToInputTypeStr(uint32_t idx) {
5050
return "TO_" + GetInputTypeStr(idx);
5151
}
5252

53+
inline std::string GetToInputIndexStr(uint32_t idx) {
54+
return "INPUT" + std::to_string(idx) + "_GET_INDEX";
55+
}
56+
5357
JitConstants NonMaxSuppressionKernelRef::GetJitConstants(const non_max_suppression_params& params) const {
5458
JitConstants jit = MakeBaseParamsJitConstants(params);
5559

@@ -109,11 +113,13 @@ JitConstants NonMaxSuppressionKernelRef::GetJitConstants(const non_max_suppressi
109113
if (params.has_second_output) {
110114
jit.AddConstant(MakeJitConstant("SECOND_OUTPUT_TYPE", GetInputTypeStr(params.GetIndexSecondOutput())));
111115
jit.AddConstant(MakeJitConstant("TO_SECOND_OUTPUT_TYPE", GetToInputTypeStr(params.GetIndexSecondOutput())));
116+
jit.AddConstant(MakeJitConstant("SECOND_OUTPUT_GET_INDEX", GetToInputIndexStr(params.GetIndexSecondOutput())));
112117
}
113118

114119
if (params.has_third_output) {
115120
jit.AddConstant(MakeJitConstant("THIRD_OUTPUT_TYPE", GetInputTypeStr(params.GetIndexThirdOutput())));
116121
jit.AddConstant(MakeJitConstant("TO_THIRD_OUTPUT_TYPE", GetToInputTypeStr(params.GetIndexThirdOutput())));
122+
jit.AddConstant(MakeJitConstant("THIRD_OUTPUT_GET_INDEX", GetToInputIndexStr(params.GetIndexThirdOutput())));
117123
}
118124

119125
return jit;
@@ -146,8 +152,9 @@ NonMaxSuppressionKernelRef::DispatchData SetDefault(const non_max_suppression_pa
146152

147153
const auto& input = params.inputs[1];
148154
if (idx == 0) {
149-
dispatchData.gws = {input.Batch().v, input.Feature().v, params.engineInfo.maxWorkGroupSize};
150-
dispatchData.lws = {1, 1, params.engineInfo.maxWorkGroupSize};
155+
const size_t boxesGroupSize = std::min(params.inputs[0].Feature().v, params.engineInfo.maxWorkGroupSize);
156+
dispatchData.gws = {input.Batch().v, input.Feature().v, boxesGroupSize};
157+
dispatchData.lws = {1, 1, boxesGroupSize};
151158
} else if (idx == 1) {
152159
const size_t kSplitNum = 16;
153160
dispatchData.gws = {input.Batch().v, input.Feature().v, kSplitNum};
@@ -261,7 +268,7 @@ KernelsData NonMaxSuppressionKernelRef::GetKernelsData(const Params& params, con
261268
// Build clKernelData.
262269
for (size_t i = 0; i < kKernelsNum; i++) {
263270
DispatchData dispatchData = SetDefault(orgParams, static_cast<int>(i));
264-
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, params, options);
271+
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, params, options, i);
265272
auto cldnn_jit = GetJitConstants(orgParams);
266273
cldnn_jit.AddConstant(MakeJitConstant("BUFFER_STRIDE", buffer_stride));
267274

0 commit comments

Comments
 (0)