[GPU][PoC] Common shape info buffer #28167

Draft: wants to merge 1 commit into master
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -160,6 +160,7 @@ struct network {
std::map<primitive_id, network_output> execute(const std::vector<event::ptr>& dependencies = {});

void validate_primitives();
void preallocate_shape_info_buffers();
void set_arguments();
// Implementation specific calls
bool does_node_need_lockable_output(const primitive_id& id) const;
@@ -220,6 +221,9 @@ struct network {
bool _reset_arguments;
bool _reuse_variable_mem = false;

/* Common memory pointer for shape_info */
memory::ptr _shape_info_ptr;

std::unordered_map<primitive_id, std::shared_ptr<primitive_inst>> _primitives;
std::vector<shared_mem_type> _in_out_shared_mem_types;
std::vector<std::shared_ptr<primitive_inst>> _inputs;
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -57,6 +57,9 @@ class engine {
/// Created memory object from memory @p params and reinterpred the data using specified @p layout
virtual memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) = 0;

/// Creates a subbuffer memory object from the other @p memory and reinterprets the data using the specified @p new_layout
virtual memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) = 0;

/// Created memory object from the other @p memory and reinterpred the data using specified @p new_layout
virtual memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) = 0;

1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -232,6 +232,7 @@ class primitive_inst {
}

memory::ptr shape_info_memory_ptr() const { return _shape_info_memory; }
void set_shape_info_memory_ptr(memory::ptr addr);

void add_dep_events(const std::vector<event::ptr>& events);
void add_dep_event(event::ptr ev);
36 changes: 36 additions & 0 deletions src/plugins/intel_gpu/src/graph/network.cpp
@@ -207,6 +207,7 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
build_insts_deps();
build_exec_order();
validate_primitives();
preallocate_shape_info_buffers();
add_default_output_chains();
}

@@ -275,6 +276,41 @@ void network::validate_primitives() {
}
}

void network::preallocate_shape_info_buffers() {
GPU_DEBUG_DEFINE_MEM_LOGGER("preallocate_shape_info_buffers");
int64_t sum = 0;

/* Align each per-primitive shape_info region to a multiple of 512 elements (2048 bytes for i32) for performance */
uint64_t alignment = 512;

for (auto const& prim : _exec_order) {
auto& node = prim->get_node();
int64_t shape_elements = node.get_total_shape_info_size();

shape_elements = (shape_elements + alignment - 1) / alignment * alignment;
sum += shape_elements;
}

if (sum == 0)
return;

auto& engine = get_engine();
_shape_info_ptr = engine.allocate_memory(layout{{sum}, data_types::i32, format::bfyx}, false);
int offset = 0;
for (auto const& prim : _exec_order) {
auto& node = prim->get_node();
int64_t shape_elements = node.get_total_shape_info_size();

if (shape_elements == 0)
continue;

auto new_mem = engine.create_subbuffer(*_shape_info_ptr, layout{{shape_elements}, data_types::i32, format::bfyx}, offset);
prim->set_shape_info_memory_ptr(new_mem);

offset += (shape_elements + alignment - 1) / alignment * alignment * 4;
}
}

void network::set_arguments() {
if (!_reset_arguments)
return;
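The preallocation above packs every primitive's shape_info region into a single buffer: each node's element count is rounded up to the alignment, the rounded counts are summed for one allocation, and the same rounding drives the byte offset of each sub-buffer. A minimal standalone sketch of that arithmetic (plain C++; the shape_info_sizes values are hypothetical stand-ins for the per-node get_total_shape_info_size() results):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Hypothetical per-primitive shape_info sizes, in i32 elements.
    const std::vector<int64_t> shape_info_sizes = {7, 130, 0, 513};
    const int64_t alignment = 512;  // elements, as in preallocate_shape_info_buffers()
    const int64_t elem_size = 4;    // sizeof(int32_t) for data_types::i32

    // Pass 1: total element count for the single common allocation.
    int64_t total_elements = 0;
    for (int64_t n : shape_info_sizes)
        total_elements += (n + alignment - 1) / alignment * alignment;

    // Pass 2: byte offset of each primitive's sub-buffer inside that allocation.
    int64_t offset_bytes = 0;
    for (int64_t n : shape_info_sizes) {
        if (n == 0)
            continue;  // skipped, same as the continue in preallocate_shape_info_buffers()
        std::cout << n << " elements -> sub-buffer at byte offset " << offset_bytes << "\n";
        offset_bytes += (n + alignment - 1) / alignment * alignment * elem_size;
    }
    std::cout << "common buffer size: " << total_elements * elem_size << " bytes\n";
}

With the sample sizes this prints offsets 0, 2048 and 4096 and an 8192-byte allocation: because the 512-element alignment is applied to i32 elements, every sub-buffer starts on a 2048-byte boundary.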
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1146,6 +1146,10 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la
}
}

void primitive_inst::set_shape_info_memory_ptr(memory::ptr addr) {
_shape_info_memory = addr;
}

void primitive_inst::allocate_shape_info_memory() {
int64_t shape_elements = _node->get_total_shape_info_size();
_shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false);
31 changes: 30 additions & 1 deletion src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -207,7 +207,36 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
}
}
}
memory::ptr ocl_engine::create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) {
OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to create a subbuffer from a buffer allocated by a different engine");
try {
if (new_layout.format.is_image_2d()) {
OPENVINO_NOT_IMPLEMENTED;
} else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
auto& new_buf = reinterpret_cast<const ocl::gpu_usm&>(memory);
auto ptr = new_buf.get_buffer().get();
auto sub_buffer = cl::UsmMemory(get_usm_helper(), ptr, offset);

return std::make_shared<ocl::gpu_usm>(this,
new_layout,
sub_buffer,
memory.get_allocation_type(),
memory.get_mem_tracker());
} else {
auto buffer = reinterpret_cast<const ocl::gpu_buffer&>(memory).get_buffer();
cl_buffer_region sub_buffer_region = { offset, new_layout.bytes_count() };
// Host-pointer flags cannot be passed to clCreateSubBuffer; they are inherited from the parent buffer.
auto sub_buffer = buffer.createSubBuffer(CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &sub_buffer_region);

return std::make_shared<ocl::gpu_buffer>(this,
new_layout,
sub_buffer,
memory.get_mem_tracker());
}
} catch (cl::Error const& err) {
OPENVINO_THROW(OCL_ERR_MSG_FMT(err));
}
}
memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) {
OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine");
OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(),
@@ -221,7 +250,7 @@ memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& n
reinterpret_cast<const ocl::gpu_image2d&>(memory).get_buffer(),
memory.get_mem_tracker());
} else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
return std::make_shared<ocl::gpu_usm>(this,
new_layout,
reinterpret_cast<const ocl::gpu_usm&>(memory).get_buffer(),
memory.get_allocation_type(),
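Taken together with network.cpp above, the intended call pattern is: allocate one parent buffer, then hand each primitive a create_subbuffer() view at its precomputed byte offset. A hedged sketch of that usage (make_two_shape_info_views is a hypothetical helper and engine_ref is assumed to be an already-constructed engine; the types and calls mirror the ones in this diff):

#include <utility>

#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/layout.hpp"
#include "intel_gpu/runtime/memory.hpp"

namespace cldnn {

std::pair<memory::ptr, memory::ptr> make_two_shape_info_views(engine& engine_ref) {
    // One parent allocation of 1024 i32 elements (4096 bytes), mirroring the
    // allocate_memory call in preallocate_shape_info_buffers().
    memory::ptr parent = engine_ref.allocate_memory(
        layout{{1024}, data_types::i32, format::bfyx}, false);

    // First view: 7 shape_info elements at byte offset 0.
    memory::ptr view0 = engine_ref.create_subbuffer(
        *parent, layout{{7}, data_types::i32, format::bfyx}, 0);

    // Second view: next 512-element boundary, i.e. byte offset 512 * 4 = 2048.
    memory::ptr view1 = engine_ref.create_subbuffer(
        *parent, layout{{130}, data_types::i32, format::bfyx}, 2048);

    // In the PR each such view is handed to its primitive via set_shape_info_memory_ptr(),
    // so all shape_info updates land in one contiguous allocation.
    return {view0, view1};
}

}  // namespace cldnn

Whether the returned view is a cl_mem sub-buffer or an offset USM pointer is decided inside create_subbuffer() from the parent's allocation type, as shown in the implementation above.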
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -26,6 +26,7 @@ class ocl_engine : public engine {

memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override;
memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override;
memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override;
memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
bool check_allocatable(const layout& layout, allocation_type type) override;
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
@@ -928,9 +928,9 @@ class UsmHolder {
class UsmMemory {
public:
explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { }
UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr)
UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr, size_t offset = 0)
: _usmHelper(usmHelper)
, _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, usm_ptr, true)) {
, _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, reinterpret_cast<char*>(usm_ptr) + offset, true)) {
if (!usm_ptr) {
throw std::runtime_error("[GPU] Can't share null usm pointer");
}