Skip to content

Commit 59cb380

Browse files
author
iclsrc
committed
Merge from 'sycl' to 'sycl-web' (4 commits)
2 parents c072227 + ed1b620 commit 59cb380

File tree

10 files changed

+195
-77
lines changed

10 files changed

+195
-77
lines changed

.github/workflows/sycl-ur-perf-benchmarking.yml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,22 +126,24 @@ jobs:
126126
- id: sanitize
127127
run: |
128128
# Validate user inputs:
129-
# usage: check_if_nonempty <var> <regex to check var against> <err message>
130-
check_nonempty() {
129+
# Usage: validate_if_nonempty <var> <regex to check var against> <err message>
130+
# If <var> is empty we skip validation. Either of these vars can be left empty by the user.
131+
validate_if_nonempty() {
131132
[ -z "$1" ] && return
132133
if [ -z "$(echo "$1" | grep -P "$2")" ]; then
133134
echo "$3"
134135
exit 1
135136
fi
136137
}
137-
check_nonempty "$COMMIT_HASH" '^[0-9a-f]{7,}$' "Bad commit hash (or hash short)."
138-
check_nonempty "$PR_NO" '^[0-9]+$' "Bad PR number."
139-
check_nonempty "$SAVE_NAME" '^[A-Za-z][A-Za-z0-9_-]+$' "Bad save name."
138+
validate_if_nonempty "$COMMIT_HASH" '^[0-9a-f]{7,}$' "Bad commit hash (or hash short)."
139+
validate_if_nonempty "$PR_NO" '^[0-9]+$' "Bad PR number."
140+
validate_if_nonempty "$SAVE_NAME" '^[A-Za-z][A-Za-z0-9_-]+$' "Bad save name. Use only alphanumerics, dash, underscore, and start with a letter."
141+
validate_if_nonempty "$SAVE_NAME" '^(?![Bb]+aseline$)' "Save name cannot be 'Baseline' nor 'baseline' - it's reserved for nightly."
140142

141143
BENCHMARK_SAVE_NAME=""
142-
BUILD_REF="${{ github.ref }}"
144+
BUILD_REF="$GITHUB_REF"
143145
if [ -n "$SAVE_NAME" ]; then
144-
BENCHMARK_SAVE_NAME="$(echo "$SAVE_NAME" | tr -cd 'A-Za-z0-9_-')"
146+
BENCHMARK_SAVE_NAME="$SAVE_NAME"
145147
fi;
146148
if [ -n "$COMMIT_HASH" ]; then
147149
echo "Using commit hash $COMMIT_HASH for build..."
@@ -153,7 +155,8 @@ jobs:
153155
BUILD_REF="refs/pull/$PR_NO/head"
154156
[ -z "$BENCHMARK_SAVE_NAME" ] && BENCHMARK_SAVE_NAME="PR_${PR_NO}"
155157
fi
156-
[ -z "$BENCHMARK_SAVE_NAME" ] && BENCHMARK_SAVE_NAME="Baseline"
158+
# Fall back to ref_name that triggered the workflow:
159+
[ -z "$BENCHMARK_SAVE_NAME" ] && BENCHMARK_SAVE_NAME="$GITHUB_REF_NAME"
157160

158161
echo "benchmark_save_name=$BENCHMARK_SAVE_NAME" >> $GITHUB_OUTPUT
159162
echo "build_ref=$BUILD_REF" >> $GITHUB_OUTPUT

sycl/source/detail/device_image_impl.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,9 @@ class device_image_impl
546546
}
547547

548548
ur_program_handle_t get_ur_program() const noexcept { return MProgram; }
549+
void set_ur_program(Managed<ur_program_handle_t> &&Program) {
550+
MProgram = std::move(Program);
551+
}
549552

550553
const RTDeviceBinaryImage *const &get_bin_image_ref() const {
551554
return std::get<const RTDeviceBinaryImage *>(MBinImage);

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2292,8 +2292,12 @@ ProgramManager::createDependencyImage(const context &Ctx, devices_range Devs,
22922292
if (DepIt != m_BinImg2KernelIDs.end())
22932293
DepKernelIDs = DepIt->second;
22942294
}
2295-
2296-
assert(DepState == getBinImageState(DepImage) &&
2295+
// The only difference between object and executable images is whether they
2296+
// have external dependencies, so executable images are valid dependencies for
2297+
// object ones.
2298+
assert((DepState == getBinImageState(DepImage) ||
2299+
(DepState == bundle_state::object &&
2300+
getBinImageState(DepImage) == bundle_state::executable)) &&
22972301
"State mismatch between main image and its dependency");
22982302

22992303
return createSyclObjFromImpl<device_image_plain>(device_image_impl::create(
@@ -2568,7 +2572,6 @@ ProgramManager::link(device_images_range Imgs, devices_range Devs,
25682572
PropList, NoAllowedPropertiesCheck, NoAllowedPropertiesCheck);
25692573
}
25702574

2571-
auto URPrograms = Imgs.to<std::vector<ur_program_handle_t>>();
25722575
auto URDevices = Devs.to<std::vector<ur_device_handle_t>>();
25732576

25742577
// FIXME: Linker options are picked from the first object, but is that safe?
@@ -2584,6 +2587,16 @@ ProgramManager::link(device_images_range Imgs, devices_range Devs,
25842587
context_impl &ContextImpl = *getSyclObjImpl(Context);
25852588
adapter_impl &Adapter = ContextImpl.getAdapter();
25862589

2590+
// We create UR programs lazily, so device_images that started off in the
2591+
// object state might not have a UR program associated with them.
2592+
for (auto &Img : Imgs) {
2593+
if (Img.get_ur_program() == nullptr) {
2594+
Img.set_ur_program(
2595+
createURProgram(*Img.get_bin_image_ref(), ContextImpl, Devs));
2596+
}
2597+
}
2598+
auto URPrograms = Imgs.to<std::vector<ur_program_handle_t>>();
2599+
25872600
ur_exp_program_flags_t UrLinkFlags{};
25882601
if (AllowUnresolvedSymbols)
25892602
UrLinkFlags &= UR_EXP_PROGRAM_FLAG_ALLOW_UNRESOLVED_SYMBOLS;

sycl/test-e2e/Adapters/level_zero/interop-get-native-mem.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,6 @@
77
// RUN: %{build} %level_zero_options -o %t.out
88
// RUN: %{run} %t.out
99

10-
// L0v2 adapter does not support integrated buffers yet
11-
// UNSUPPORTED: level_zero_v2_adapter
12-
// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20280
13-
1410
// Test get_native_mem for the Level Zero backend.
1511

1612
// Level-Zero

sycl/test-e2e/Basic/buffer/buffer_create.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,6 @@
88
// RUN: %{run} %t.out 2>&1 | FileCheck %s
99
// UNSUPPORTED: ze_debug
1010

11-
// L0v2 adapter doesn't optimize buffer creation based on device type yet
12-
// (integrated buffer implementation needs more work).
13-
// UNSUPPORTED: level_zero_v2_adapter
14-
// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20121
15-
1611
#include <iostream>
1712
#include <level_zero/ze_api.h>
1813
#include <sycl/detail/core.hpp>

sycl/test-e2e/DeviceImageDependencies/Inputs/basic.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ int main() {
3737
cgh.single_task<Kernel<2>>([=]() { acc[0] = levelA(acc[0]); });
3838
});
3939
});
40+
#ifndef USE_AOT
4041
runTest(q, [](queue &q, buffer<int, 1> &buf) {
4142
kernel_bundle KBInput = get_kernel_bundle<sycl::bundle_state::input>(
4243
q.get_context(), {get_kernel_id<Kernel<3>>()});
@@ -48,4 +49,5 @@ int main() {
4849
cgh.single_task<Kernel<3>>([=]() { acc[0] = levelA(acc[0]); });
4950
});
5051
});
52+
#endif
5153
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Test -fsycl-allow-device-image-dependencies with dynamic libraries and AOT.
2+
3+
// REQUIRES: ocloc, gpu, target_spir
4+
5+
// DEFINE: %{aot_options} = -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts -DUSE_AOT
6+
// DEFINE: %{dynamic_lib_options} = %{aot_options} %fPIC %shared_lib -fsycl-allow-device-image-dependencies -I %S/Inputs %if windows %{-DMAKE_DLL %}
7+
// DEFINE: %{dynamic_lib_suffix} = %if windows %{dll%} %else %{so%}
8+
9+
// RUN: rm -rf %t.dir; mkdir -p %t.dir
10+
// RUN: %clangxx %{dynamic_lib_options} %S/Inputs/d.cpp -o %t.dir/libdevice_d.%{dynamic_lib_suffix}
11+
// RUN: %clangxx %{dynamic_lib_options} %S/Inputs/c.cpp %if windows %{%t.dir/libdevice_d.lib%} -o %t.dir/libdevice_c.%{dynamic_lib_suffix}
12+
// RUN: %clangxx %{dynamic_lib_options} %S/Inputs/b.cpp %if windows %{%t.dir/libdevice_c.lib%} -o %t.dir/libdevice_b.%{dynamic_lib_suffix}
13+
// RUN: %clangxx %{dynamic_lib_options} %S/Inputs/a.cpp %if windows %{%t.dir/libdevice_b.lib%} -o %t.dir/libdevice_a.%{dynamic_lib_suffix}
14+
15+
// RUN: %clangxx -fsycl %{aot_options} -fsycl-allow-device-image-dependencies -fsycl-device-code-split=per_kernel %S/Inputs/basic.cpp -o %t.dir/%{t:stem}.out \
16+
// RUN: %if windows \
17+
// RUN: %{%t.dir/libdevice_a.lib%} \
18+
// RUN: %else \
19+
// RUN: %{-L%t.dir -ldevice_a -ldevice_b -ldevice_c -ldevice_d -Wl,-rpath=%t.dir%}
20+
21+
// RUN: %{run} %t.dir/%{t:stem}.out
22+
23+
// Remove once the Level Zero runtime supports native binaries in the program
24+
// module extension.
25+
// UNSUPPORTED: level_zero

unified-runtime/source/adapters/level_zero/v2/memory.cpp

Lines changed: 122 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "memory.hpp"
12+
1213
#include "../ur_interface_loader.hpp"
1314
#include "context.hpp"
1415

@@ -53,6 +54,34 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/,
5354
/* nop */
5455
}
5556

57+
static v2::raii::command_list_unique_handle
58+
getSyncCommandListForCopy(ur_context_handle_t hContext,
59+
ur_device_handle_t hDevice) {
60+
v2::command_list_desc_t listDesc;
61+
listDesc.IsInOrder = true;
62+
listDesc.Ordinal =
63+
hDevice
64+
->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
65+
.ZeOrdinal;
66+
listDesc.CopyOffloadEnable = true;
67+
return hContext->getCommandListCache().getImmediateCommandList(
68+
hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
69+
ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
70+
}
71+
72+
static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
73+
ur_device_handle_t hDevice, void *dst,
74+
const void *src, size_t size) try {
75+
auto commandList = getSyncCommandListForCopy(hContext, hDevice);
76+
77+
ZE2UR_CALL(zeCommandListAppendMemoryCopy,
78+
(commandList.get(), dst, src, size, nullptr, 0, nullptr));
79+
80+
return UR_RESULT_SUCCESS;
81+
} catch (...) {
82+
return exceptionToResult(std::current_exception());
83+
}
84+
5685
ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
5786
ur_context_handle_t hContext, void *hostPtr, size_t size,
5887
device_access_mode_t accessMode)
@@ -68,6 +97,7 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
6897
});
6998
} else {
7099
void *rawPtr;
100+
// Use HOST memory for integrated GPUs to enable zero-copy device access
71101
UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
72102
hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr));
73103

@@ -79,7 +109,12 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
79109
});
80110

81111
if (hostPtr) {
82-
std::memcpy(this->ptr.get(), hostPtr, size);
112+
// Initial copy using Level Zero for USM HOST memory
113+
auto hDevice = hContext->getDevices()[0];
114+
UR_CALL_THROWS(
115+
synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size));
116+
// Store writeBackPtr for copy-back - needed when original pointer
117+
// cannot be imported (e.g., misaligned, wrong allocation type)
83118
writeBackPtr = hostPtr;
84119
}
85120
}
@@ -97,12 +132,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
97132
});
98133
}
99134

100-
ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
101-
if (writeBackPtr) {
102-
std::memcpy(writeBackPtr, this->ptr.get(), size);
103-
}
104-
}
105-
106135
void *ur_integrated_buffer_handle_t::getDevicePtr(
107136
ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/,
108137
size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
@@ -111,48 +140,93 @@ void *ur_integrated_buffer_handle_t::getDevicePtr(
111140
}
112141

113142
void *ur_integrated_buffer_handle_t::mapHostPtr(
114-
ur_map_flags_t /*flags*/, size_t offset, size_t /*size*/,
143+
ur_map_flags_t flags, size_t offset, size_t mapSize,
115144
ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) {
116-
// TODO: if writeBackPtr is set, we should map to that pointer
117-
// because that's what SYCL expects, SYCL will attempt to call free
118-
// on the resulting pointer leading to double free with the current
119-
// implementation. Investigate the SYCL implementation.
145+
if (writeBackPtr) {
146+
// Copy-back path: user gets back their original pointer
147+
void *mappedPtr = ur_cast<char *>(writeBackPtr) + offset;
148+
149+
if (flags & UR_MAP_FLAG_READ) {
150+
// Use Level Zero copy for USM HOST memory to ensure GPU visibility
151+
auto hDevice = hContext->getDevices()[0];
152+
UR_CALL_THROWS(synchronousZeCopy(hContext, hDevice, mappedPtr,
153+
ur_cast<char *>(ptr.get()) + offset,
154+
mapSize));
155+
}
156+
157+
// Track this mapping for unmap
158+
mappedRegions.emplace_back(usm_unique_ptr_t(mappedPtr, [](void *) {}),
159+
mapSize, offset, flags);
160+
161+
return mappedPtr;
162+
}
163+
164+
// Zero-copy path: for successfully imported or USM pointers
120165
return ur_cast<char *>(ptr.get()) + offset;
121166
}
122167

123168
void ur_integrated_buffer_handle_t::unmapHostPtr(
124-
void * /*pMappedPtr*/, ze_command_list_handle_t /*cmdList*/,
169+
void *pMappedPtr, ze_command_list_handle_t /*cmdList*/,
125170
wait_list_view & /*waitListView*/) {
126-
// TODO: if writeBackPtr is set, we should copy the data back
127-
/* nop */
128-
}
171+
if (writeBackPtr) {
172+
// Copy-back path: find the mapped region and copy data back if needed
173+
auto mappedRegion =
174+
std::find_if(mappedRegions.begin(), mappedRegions.end(),
175+
[pMappedPtr](const host_allocation_desc_t &desc) {
176+
return desc.ptr.get() == pMappedPtr;
177+
});
178+
179+
if (mappedRegion == mappedRegions.end()) {
180+
UR_DFAILURE("could not find pMappedPtr:" << pMappedPtr);
181+
throw UR_RESULT_ERROR_INVALID_ARGUMENT;
182+
}
129183

130-
static v2::raii::command_list_unique_handle
131-
getSyncCommandListForCopy(ur_context_handle_t hContext,
132-
ur_device_handle_t hDevice) {
133-
v2::command_list_desc_t listDesc;
134-
listDesc.IsInOrder = true;
135-
listDesc.Ordinal =
136-
hDevice
137-
->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
138-
.ZeOrdinal;
139-
listDesc.CopyOffloadEnable = true;
140-
return hContext->getCommandListCache().getImmediateCommandList(
141-
hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
142-
ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
184+
if (mappedRegion->flags &
185+
(UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) {
186+
// Use Level Zero copy for USM HOST memory to ensure GPU visibility
187+
auto hDevice = hContext->getDevices()[0];
188+
UR_CALL_THROWS(synchronousZeCopy(
189+
hContext, hDevice, ur_cast<char *>(ptr.get()) + mappedRegion->offset,
190+
mappedRegion->ptr.get(), mappedRegion->size));
191+
}
192+
193+
mappedRegions.erase(mappedRegion);
194+
return;
195+
}
196+
// No op for zero-copy path, memory is synced
143197
}
144198

145-
static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
146-
ur_device_handle_t hDevice, void *dst,
147-
const void *src, size_t size) try {
148-
auto commandList = getSyncCommandListForCopy(hContext, hDevice);
199+
void ur_integrated_buffer_handle_t::copyBackToHostIfNeeded() {
200+
if (writeBackPtr) {
201+
// Validate that the pointer is still valid before copy-back.
202+
// SYCL might already do its own copy-back and free it.
203+
ZeStruct<ze_memory_allocation_properties_t> memProps;
204+
ze_device_handle_t device;
205+
auto result = ZE_CALL_NOCHECK(
206+
zeMemGetAllocProperties,
207+
(hContext->getZeHandle(), writeBackPtr, &memProps, &device));
208+
209+
// If pointer is not a valid allocation (SYCL freed it), skip copy-back
210+
if (result != ZE_RESULT_SUCCESS ||
211+
memProps.type == ZE_MEMORY_TYPE_UNKNOWN) {
212+
writeBackPtr = nullptr;
213+
return;
214+
}
149215

150-
ZE2UR_CALL(zeCommandListAppendMemoryCopy,
151-
(commandList.get(), dst, src, size, nullptr, 0, nullptr));
216+
// Pointer is valid, perform copy-back
217+
auto hDevice = hContext->getDevices()[0];
218+
auto result2 = synchronousZeCopy(hContext, hDevice, writeBackPtr,
219+
this->ptr.get(), size);
220+
if (result2 == UR_RESULT_SUCCESS) {
221+
writeBackPtr = nullptr;
222+
} else {
223+
UR_LOG(ERR, "Failed to copy-back buffer data: {}", result2);
224+
}
225+
}
226+
}
152227

153-
return UR_RESULT_SUCCESS;
154-
} catch (...) {
155-
return exceptionToResult(std::current_exception());
228+
ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
229+
copyBackToHostIfNeeded();
156230
}
157231

158232
void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
@@ -410,19 +484,16 @@ void ur_shared_buffer_handle_t::unmapHostPtr(
410484
// nop
411485
}
412486

413-
static bool useHostBuffer(ur_context_handle_t /* hContext */) {
487+
static bool useHostBuffer(ur_context_handle_t hContext) {
414488
// We treat integrated devices (physical memory shared with the CPU)
415489
// differently from discrete devices (those with distinct memories).
416490
// For integrated devices, allocating the buffer in the host memory
417491
// enables automatic access from the device, and makes copying
418492
// unnecessary in the map/unmap operations. This improves performance.
419493

420-
// TODO: fix integrated buffer implementation
421-
return false;
422-
423-
// return hContext->getDevices().size() == 1 &&
424-
// hContext->getDevices()[0]->ZeDeviceProperties->flags &
425-
// ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
494+
return hContext->getDevices().size() == 1 &&
495+
hContext->getDevices()[0]->ZeDeviceProperties->flags &
496+
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
426497
}
427498

428499
ur_mem_sub_buffer_t::ur_mem_sub_buffer_t(ur_mem_handle_t hParent, size_t offset,
@@ -566,6 +637,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext,
566637
void *hostPtr = pProperties ? pProperties->pHost : nullptr;
567638
auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags);
568639

640+
// For integrated devices, use zero-copy host buffers. The integrated buffer
641+
// constructor will handle all cases:
642+
// 1. No host pointer - allocate USM host memory
643+
// 2. Host pointer is already USM - use directly
644+
// 3. Host pointer can be imported - import it
645+
// 4. Otherwise - allocate USM and copy-back through map/unmap operations
569646
if (useHostBuffer(hContext)) {
570647
*phBuffer = ur_mem_handle_t_::create<ur_integrated_buffer_handle_t>(
571648
hContext, hostPtr, size, accessMode);

0 commit comments

Comments
 (0)