 //===----------------------------------------------------------------------===//
 
 #include "memory.hpp"
+
 #include "../ur_interface_loader.hpp"
 #include "context.hpp"
 
@@ -53,6 +54,34 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/,
   /* nop */
 }
 
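+// Creates a synchronous, in-order immediate command list on the device's
+// compute queue group; used by the copy helper below for blocking
+// host <-> USM transfers.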
+static v2::raii::command_list_unique_handle
+getSyncCommandListForCopy(ur_context_handle_t hContext,
+                          ur_device_handle_t hDevice) {
+  v2::command_list_desc_t listDesc;
+  listDesc.IsInOrder = true;
+  listDesc.Ordinal =
+      hDevice
+          ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
+          .ZeOrdinal;
+  listDesc.CopyOffloadEnable = true;
+  return hContext->getCommandListCache().getImmediateCommandList(
+      hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
+      ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
+}
+
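+// Copies `size` bytes from `src` to `dst` and blocks until the copy has
+// completed (the command list above is created in synchronous mode).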
+static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
+                                     ur_device_handle_t hDevice, void *dst,
+                                     const void *src, size_t size) try {
+  auto commandList = getSyncCommandListForCopy(hContext, hDevice);
+
+  ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+             (commandList.get(), dst, src, size, nullptr, 0, nullptr));
+
+  return UR_RESULT_SUCCESS;
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
+
 ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     ur_context_handle_t hContext, void *hostPtr, size_t size,
     device_access_mode_t accessMode)
@@ -68,6 +97,7 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     });
   } else {
     void *rawPtr;
+    // Use HOST memory for integrated GPUs to enable zero-copy device access
     UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
         hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr));
 
@@ -79,7 +109,12 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     });
 
     if (hostPtr) {
-      std::memcpy(this->ptr.get(), hostPtr, size);
+      // Initial copy using Level Zero, since the allocation is USM HOST memory
+      auto hDevice = hContext->getDevices()[0];
+      UR_CALL_THROWS(
+          synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size));
+      // Store writeBackPtr for the copy-back path; needed when the original
+      // pointer cannot be imported (e.g., misaligned or wrong allocation type)
       writeBackPtr = hostPtr;
     }
   }
@@ -97,12 +132,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
   });
 }
 
-ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
-  if (writeBackPtr) {
-    std::memcpy(writeBackPtr, this->ptr.get(), size);
-  }
-}
-
 void *ur_integrated_buffer_handle_t::getDevicePtr(
     ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/,
     size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
@@ -111,48 +140,93 @@ void *ur_integrated_buffer_handle_t::getDevicePtr(
 }
 
 void *ur_integrated_buffer_handle_t::mapHostPtr(
-    ur_map_flags_t /*flags*/, size_t offset, size_t /*size*/,
+    ur_map_flags_t flags, size_t offset, size_t mapSize,
     ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) {
-  // TODO: if writeBackPtr is set, we should map to that pointer
-  // because that's what SYCL expects, SYCL will attempt to call free
-  // on the resulting pointer leading to double free with the current
-  // implementation. Investigate the SYCL implementation.
+  if (writeBackPtr) {
+    // Copy-back path: the user gets back their original pointer
+    void *mappedPtr = ur_cast<char *>(writeBackPtr) + offset;
+
+    if (flags & UR_MAP_FLAG_READ) {
+      // Read maps must observe the device's latest data, so refresh the
+      // user's buffer from the USM HOST allocation via Level Zero
+      auto hDevice = hContext->getDevices()[0];
+      UR_CALL_THROWS(synchronousZeCopy(hContext, hDevice, mappedPtr,
+                                       ur_cast<char *>(ptr.get()) + offset,
+                                       mapSize));
+    }
+
+    // Track this mapping for unmap; the no-op deleter ensures the user's
+    // memory is never freed by us
+    mappedRegions.emplace_back(usm_unique_ptr_t(mappedPtr, [](void *) {}),
+                               mapSize, offset, flags);
+
+    return mappedPtr;
+  }
+
+  // Zero-copy path: for successfully imported or USM pointers
   return ur_cast<char *>(ptr.get()) + offset;
 }
 
 void ur_integrated_buffer_handle_t::unmapHostPtr(
-    void * /*pMappedPtr*/, ze_command_list_handle_t /*cmdList*/,
+    void *pMappedPtr, ze_command_list_handle_t /*cmdList*/,
     wait_list_view & /*waitListView*/) {
-  // TODO: if writeBackPtr is set, we should copy the data back
-  /* nop */
-}
+  if (writeBackPtr) {
+    // Copy-back path: find the mapped region and copy data back if needed
+    auto mappedRegion =
+        std::find_if(mappedRegions.begin(), mappedRegions.end(),
+                     [pMappedPtr](const host_allocation_desc_t &desc) {
+                       return desc.ptr.get() == pMappedPtr;
+                     });
+
+    if (mappedRegion == mappedRegions.end()) {
+      UR_DFAILURE("could not find pMappedPtr:" << pMappedPtr);
+      throw UR_RESULT_ERROR_INVALID_ARGUMENT;
+    }
 
-static v2::raii::command_list_unique_handle
-getSyncCommandListForCopy(ur_context_handle_t hContext,
-                          ur_device_handle_t hDevice) {
-  v2::command_list_desc_t listDesc;
-  listDesc.IsInOrder = true;
-  listDesc.Ordinal =
-      hDevice
-          ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
-          .ZeOrdinal;
-  listDesc.CopyOffloadEnable = true;
-  return hContext->getCommandListCache().getImmediateCommandList(
-      hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
-      ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
+    if (mappedRegion->flags &
+        (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) {
+      // Use Level Zero copy for USM HOST memory to ensure GPU visibility
+      auto hDevice = hContext->getDevices()[0];
+      UR_CALL_THROWS(synchronousZeCopy(
+          hContext, hDevice, ur_cast<char *>(ptr.get()) + mappedRegion->offset,
+          mappedRegion->ptr.get(), mappedRegion->size));
+    }
+
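+    // Drop the tracking entry; its no-op deleter leaves the user's memory
+    // untouched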
+    mappedRegions.erase(mappedRegion);
+    return;
+  }
+  // No-op for the zero-copy path; the memory is already in sync
 }
 
-static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
-                                     ur_device_handle_t hDevice, void *dst,
-                                     const void *src, size_t size) try {
-  auto commandList = getSyncCommandListForCopy(hContext, hDevice);
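+// Flushes the device-visible copy back to the user's original pointer.
+// Clears writeBackPtr on success, so repeated calls (e.g., from the
+// destructor) are safe.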
+void ur_integrated_buffer_handle_t::copyBackToHostIfNeeded() {
+  if (writeBackPtr) {
+    // Validate that the pointer is still valid before the copy-back;
+    // SYCL might already have done its own copy-back and freed it.
+    ZeStruct<ze_memory_allocation_properties_t> memProps;
+    ze_device_handle_t device;
+    auto result = ZE_CALL_NOCHECK(
+        zeMemGetAllocProperties,
+        (hContext->getZeHandle(), writeBackPtr, &memProps, &device));
+
+    // If the pointer is no longer a valid allocation (SYCL freed it), skip
+    // the copy-back
+    if (result != ZE_RESULT_SUCCESS ||
+        memProps.type == ZE_MEMORY_TYPE_UNKNOWN) {
+      writeBackPtr = nullptr;
+      return;
+    }
 
-  ZE2UR_CALL(zeCommandListAppendMemoryCopy,
-             (commandList.get(), dst, src, size, nullptr, 0, nullptr));
+    // The pointer is valid; perform the copy-back
+    auto hDevice = hContext->getDevices()[0];
+    auto copyResult = synchronousZeCopy(hContext, hDevice, writeBackPtr,
+                                        this->ptr.get(), size);
+    if (copyResult == UR_RESULT_SUCCESS) {
+      writeBackPtr = nullptr;
+    } else {
+      UR_LOG(ERR, "Failed to copy back buffer data: {}", copyResult);
+    }
+  }
+}
 
-  return UR_RESULT_SUCCESS;
-} catch (...) {
-  return exceptionToResult(std::current_exception());
+ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
+  copyBackToHostIfNeeded();
 }
 
 void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
@@ -410,19 +484,16 @@ void ur_shared_buffer_handle_t::unmapHostPtr(
   // nop
 }
 
-static bool useHostBuffer(ur_context_handle_t /* hContext */) {
+static bool useHostBuffer(ur_context_handle_t hContext) {
   // We treat integrated devices (physical memory shared with the CPU)
   // differently from discrete devices (those with distinct memories).
   // For integrated devices, allocating the buffer in the host memory
   // enables automatic access from the device, and makes copying
   // unnecessary in the map/unmap operations. This improves performance.
 
-  // TODO: fix integrated buffer implementation
-  return false;
-
-  // return hContext->getDevices().size() == 1 &&
-  //        hContext->getDevices()[0]->ZeDeviceProperties->flags &
-  //        ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
+  return hContext->getDevices().size() == 1 &&
+         hContext->getDevices()[0]->ZeDeviceProperties->flags &
+             ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
 }
 
 ur_mem_sub_buffer_t::ur_mem_sub_buffer_t(ur_mem_handle_t hParent, size_t offset,
@@ -566,6 +637,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext,
   void *hostPtr = pProperties ? pProperties->pHost : nullptr;
   auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags);
 
+  // For integrated devices, use zero-copy host buffers. The integrated-buffer
+  // constructor handles all cases:
+  // 1. No host pointer - allocate USM host memory
+  // 2. Host pointer is already USM - use it directly
+  // 3. Host pointer can be imported - import it
+  // 4. Otherwise - allocate USM and copy back through map/unmap operations
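+  // Only case 4 sets writeBackPtr; the copy-back logic in map/unmap and in
+  // the destructor applies only then, cases 1-3 stay zero-copy throughout.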
   if (useHostBuffer(hContext)) {
     *phBuffer = ur_mem_handle_t_::create<ur_integrated_buffer_handle_t>(
         hContext, hostPtr, size, accessMode);