[GPU][PoC] Common shape info buffer #28167

Draft: wants to merge 1 commit into master
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -160,6 +160,7 @@ struct network {
std::map<primitive_id, network_output> execute(const std::vector<event::ptr>& dependencies = {});

void validate_primitives();
void preallocate_shape_info_buffers();
void set_arguments();
// Implementation specific calls
bool does_node_need_lockable_output(const primitive_id& id) const;
@@ -220,6 +221,9 @@ struct network {
bool _reset_arguments;
bool _reuse_variable_mem = false;

/* Common memory pointer for shape_info */
memory::ptr _shape_info_ptr;

std::unordered_map<primitive_id, std::shared_ptr<primitive_inst>> _primitives;
std::vector<shared_mem_type> _in_out_shared_mem_types;
std::vector<std::shared_ptr<primitive_inst>> _inputs;
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -57,6 +57,9 @@ class engine {
/// Created memory object from memory @p params and reinterpred the data using specified @p layout
virtual memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) = 0;

/// Creates a subbuffer memory object from the other @p memory and reinterprets the data using the specified @p new_layout
virtual memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) = 0;

/// Created memory object from the other @p memory and reinterpred the data using specified @p new_layout
virtual memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) = 0;

1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -232,6 +232,7 @@ class primitive_inst {
}

memory::ptr shape_info_memory_ptr() const { return _shape_info_memory; }
void set_shape_info_memory_ptr(memory::ptr addr);

void add_dep_events(const std::vector<event::ptr>& events);
void add_dep_event(event::ptr ev);
36 changes: 36 additions & 0 deletions src/plugins/intel_gpu/src/graph/network.cpp
@@ -207,6 +207,7 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
build_insts_deps();
build_exec_order();
validate_primitives();
preallocate_shape_info_buffers();
add_default_output_chains();
}

@@ -275,6 +276,41 @@ void network::validate_primitives() {
}
}

void network::preallocate_shape_info_buffers() {
GPU_DEBUG_DEFINE_MEM_LOGGER("preallocate_shape_info_buffers");
int64_t sum = 0;

/* Align each per-primitive shape_info region to a multiple of 512 elements (2048 bytes for i32) for performance */
uint64_t alignment = 512;

for (auto const& prim : _exec_order) {
auto& node = prim->get_node();
int64_t shape_elements = node.get_total_shape_info_size();

shape_elements = (shape_elements + alignment - 1) / alignment * alignment;
sum += shape_elements;
}

if (sum == 0)
return;

auto& engine = get_engine();
_shape_info_ptr = engine.allocate_memory(layout{{sum}, data_types::i32, format::bfyx}, false);
int offset = 0;
for (auto const& prim : _exec_order) {
auto& node = prim->get_node();
int64_t shape_elements = node.get_total_shape_info_size();

if (shape_elements == 0)
continue;

auto new_mem = engine.create_subbuffer(*_shape_info_ptr, layout{{shape_elements}, data_types::i32, format::bfyx}, offset);
prim->set_shape_info_memory_ptr(new_mem);

offset += (shape_elements + alignment - 1) / alignment * alignment * 4;
}
}

void network::set_arguments() {
if (!_reset_arguments)
return;
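The preallocation above packs every primitive's shape_info region into a single buffer: each node's element count is rounded up to the alignment, the rounded counts are summed for one allocation, and the same rounding drives the byte offset of each sub-buffer. A minimal standalone sketch of that arithmetic (plain C++; the shape_info_sizes values are hypothetical stand-ins for the per-node get_total_shape_info_size() results):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Hypothetical per-primitive shape_info sizes, in i32 elements.
    const std::vector<int64_t> shape_info_sizes = {7, 130, 0, 513};
    const int64_t alignment = 512;  // elements, as in preallocate_shape_info_buffers()
    const int64_t elem_size = 4;    // sizeof(int32_t) for data_types::i32

    // Pass 1: total element count for the single common allocation.
    int64_t total_elements = 0;
    for (int64_t n : shape_info_sizes)
        total_elements += (n + alignment - 1) / alignment * alignment;

    // Pass 2: byte offset of each primitive's sub-buffer inside that allocation.
    int64_t offset_bytes = 0;
    for (int64_t n : shape_info_sizes) {
        if (n == 0)
            continue;  // skipped, same as the continue in preallocate_shape_info_buffers()
        std::cout << n << " elements -> sub-buffer at byte offset " << offset_bytes << "\n";
        offset_bytes += (n + alignment - 1) / alignment * alignment * elem_size;
    }
    std::cout << "common buffer size: " << total_elements * elem_size << " bytes\n";
}

With the sample sizes this prints offsets 0, 2048 and 4096 and an 8192-byte allocation: because the 512-element alignment is applied to i32 elements, every sub-buffer starts on a 2048-byte boundary.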
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1146,6 +1146,10 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la
}
}

void primitive_inst::set_shape_info_memory_ptr(memory::ptr addr) {
_shape_info_memory = addr;
}

void primitive_inst::allocate_shape_info_memory() {
int64_t shape_elements = _node->get_total_shape_info_size();
_shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false);
31 changes: 30 additions & 1 deletion src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -207,7 +207,36 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
}
}
}
memory::ptr ocl_engine::create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) {
OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to create a subbuffer from a buffer allocated by a different engine");
try {
if (new_layout.format.is_image_2d()) {
OPENVINO_NOT_IMPLEMENTED;
} else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
auto& new_buf = reinterpret_cast<const ocl::gpu_usm&>(memory);
auto ptr = new_buf.get_buffer().get();
auto sub_buffer = cl::UsmMemory(get_usm_helper(), ptr, offset);

return std::make_shared<ocl::gpu_usm>(this,
new_layout,
sub_buffer,
memory.get_allocation_type(),
memory.get_mem_tracker());
} else {
auto buffer = reinterpret_cast<const ocl::gpu_buffer&>(memory).get_buffer();
cl_buffer_region sub_buffer_region = { offset, new_layout.bytes_count() };
// Host-pointer flags cannot be passed to clCreateSubBuffer; they are inherited from the parent buffer.
auto sub_buffer = buffer.createSubBuffer(CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &sub_buffer_region);

return std::make_shared<ocl::gpu_buffer>(this,
new_layout,
sub_buffer,
memory.get_mem_tracker());
}
} catch (cl::Error const& err) {
OPENVINO_THROW(OCL_ERR_MSG_FMT(err));
}
}
memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) {
OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine");
OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(),
@@ -221,7 +250,7 @@ memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& n
reinterpret_cast<const ocl::gpu_image2d&>(memory).get_buffer(),
memory.get_mem_tracker());
} else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
return std::make_shared<ocl::gpu_usm>(this,
new_layout,
reinterpret_cast<const ocl::gpu_usm&>(memory).get_buffer(),
memory.get_allocation_type(),
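Taken together with network.cpp above, the intended call pattern is: allocate one parent buffer, then hand each primitive a create_subbuffer() view at its precomputed byte offset. A hedged sketch of that usage (make_two_shape_info_views is a hypothetical helper and engine_ref is assumed to be an already-constructed engine; the types and calls mirror the ones in this diff):

#include <utility>

#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/layout.hpp"
#include "intel_gpu/runtime/memory.hpp"

namespace cldnn {

std::pair<memory::ptr, memory::ptr> make_two_shape_info_views(engine& engine_ref) {
    // One parent allocation of 1024 i32 elements (4096 bytes), mirroring the
    // allocate_memory call in preallocate_shape_info_buffers().
    memory::ptr parent = engine_ref.allocate_memory(
        layout{{1024}, data_types::i32, format::bfyx}, false);

    // First view: 7 shape_info elements at byte offset 0.
    memory::ptr view0 = engine_ref.create_subbuffer(
        *parent, layout{{7}, data_types::i32, format::bfyx}, 0);

    // Second view: next 512-element boundary, i.e. byte offset 512 * 4 = 2048.
    memory::ptr view1 = engine_ref.create_subbuffer(
        *parent, layout{{130}, data_types::i32, format::bfyx}, 2048);

    // In the PR each such view is handed to its primitive via set_shape_info_memory_ptr(),
    // so all shape_info updates land in one contiguous allocation.
    return {view0, view1};
}

}  // namespace cldnn

Whether the returned view is a cl_mem sub-buffer or an offset USM pointer is decided inside create_subbuffer() from the parent's allocation type, as shown in the implementation above.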
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -26,6 +26,7 @@ class ocl_engine : public engine {

memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override;
memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override;
memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override;
memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
bool check_allocatable(const layout& layout, allocation_type type) override;
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
@@ -928,9 +928,9 @@ class UsmHolder {
class UsmMemory {
public:
explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { }
UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr)
UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr, size_t offset = 0)
: _usmHelper(usmHelper)
, _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, usm_ptr, true)) {
, _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, reinterpret_cast<char*>(usm_ptr) + offset, true)) {
if (!usm_ptr) {
throw std::runtime_error("[GPU] Can't share null usm pointer");
}