diff --git a/unified-runtime/source/adapters/level_zero/device.cpp b/unified-runtime/source/adapters/level_zero/device.cpp index da7de39f0bc0..aada65227bb7 100644 --- a/unified-runtime/source/adapters/level_zero/device.cpp +++ b/unified-runtime/source/adapters/level_zero/device.cpp @@ -1082,7 +1082,11 @@ ur_result_t urDeviceGetInfo( return ReturnValue(UpdateCapabilities); } case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: +#ifdef UR_ADAPTER_LEVEL_ZERO_V2 + return ReturnValue(true); +#else return ReturnValue(false); +#endif case UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP: return ReturnValue(false); case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: { diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index a6541cff99ad..b9f2f8cd335f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -69,7 +69,7 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( : commandListManager( context, device, std::forward(commandList), - v2::EVENT_FLAGS_COUNTER, nullptr), + v2::EVENT_FLAGS_COUNTER, nullptr, false), isUpdatable(desc ? desc->isUpdatable : false), context(context), device(device) {} @@ -102,10 +102,17 @@ ur_result_t ur_exp_command_buffer_handle_t_::finalizeCommandBuffer() { isFinalized = true; return UR_RESULT_SUCCESS; } + ur_event_handle_t ur_exp_command_buffer_handle_t_::getExecutionEventUnlocked() { return currentExecution; } +void ur_exp_command_buffer_handle_t_::enableEvents() { + for (auto &event : addedEvents) { + event->markEventAsInUse(); + } + addedEvents.clear(); +} ur_result_t ur_exp_command_buffer_handle_t_::registerExecutionEventUnlocked( ur_event_handle_t nextExecutionEvent) { if (currentExecution) { @@ -158,6 +165,10 @@ ur_result_t ur_exp_command_buffer_handle_t_::applyUpdateCommands( return UR_RESULT_SUCCESS; } +void ur_exp_command_buffer_handle_t_::registerEvent(ur_event_handle_t event) { + addedEvents.push_back(event); + event->markEventAsNotInUse(); +} namespace ur::level_zero { ur_result_t @@ -226,10 +237,9 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( uint32_t numKernelAlternatives, ur_kernel_handle_t *kernelAlternatives, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*syncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*eventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *eventWaitList, ur_exp_command_buffer_sync_point_t * /*retSyncPoint*/, - ur_event_handle_t * /*event*/, + ur_event_handle_t *event, ur_exp_command_buffer_command_handle_t *command) try { if (command != nullptr && !commandBuffer->isUpdatable) { @@ -247,8 +257,11 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( numKernelAlternatives, kernelAlternatives, command)); } UR_CALL(commandListLocked->appendKernelLaunch( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, - nullptr, nullptr)); + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, eventWaitList, event)); + if (event != nullptr) { + commandBuffer->registerEvent(*event); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -258,17 +271,19 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, size_t size, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMMemcpy(false, pDst, pSrc, size, 0, - nullptr, nullptr)); + UR_CALL(commandListLocked->appendUSMMemcpy( + false, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -279,10 +294,9 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp( ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp @@ -290,8 +304,12 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp( // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); UR_CALL(commandListLocked->appendMemBufferCopy( - hSrcMem, hDstMem, srcOffset, dstOffset, size, 0, nullptr, nullptr)); + hSrcMem, hDstMem, srcOffset, dstOffset, size, numEventsInWaitList, + phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -302,10 +320,9 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp( size_t offset, size_t size, const void *pSrc, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp @@ -313,8 +330,12 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp( // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); UR_CALL(commandListLocked->appendMemBufferWrite(hBuffer, false, offset, size, - pSrc, 0, nullptr, nullptr)); + pSrc, numEventsInWaitList, + phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -325,18 +346,21 @@ ur_result_t urCommandBufferAppendMemBufferReadExp( size_t offset, size_t size, void *pDst, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); UR_CALL(commandListLocked->appendMemBufferRead(hBuffer, false, offset, size, - pDst, 0, nullptr, nullptr)); + pDst, numEventsInWaitList, + phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -349,10 +373,9 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp( size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp @@ -361,8 +384,12 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); UR_CALL(commandListLocked->appendMemBufferCopyRect( hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, 0, nullptr, nullptr)); + srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, + phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -375,10 +402,9 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp @@ -387,9 +413,12 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); UR_CALL(commandListLocked->appendMemBufferWriteRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, 0, nullptr, - nullptr)); + bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, numEventsInWaitList, + phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -402,10 +431,9 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp @@ -414,9 +442,12 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); UR_CALL(commandListLocked->appendMemBufferReadRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, 0, nullptr, - nullptr)); + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, numEventsInWaitList, + phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -427,15 +458,18 @@ ur_result_t urCommandBufferAppendUSMFillExp( const void *pPattern, size_t patternSize, size_t size, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { auto commandListLocked = hCommandBuffer->commandListManager.lock(); UR_CALL(commandListLocked->appendUSMFill(pMemory, patternSize, pPattern, size, - 0, nullptr, nullptr)); + numEventsInWaitList, phEventWaitList, + phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -446,16 +480,19 @@ ur_result_t urCommandBufferAppendMemBufferFillExp( const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); UR_CALL(commandListLocked->appendMemBufferFill( - hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr)); + hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -466,17 +503,19 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( size_t size, ur_usm_migration_flags_t flags, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMPrefetch(pMemory, size, flags, 0, nullptr, - nullptr)); + UR_CALL(commandListLocked->appendUSMPrefetch( + pMemory, size, flags, numEventsInWaitList, phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { @@ -488,21 +527,23 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( size_t size, ur_usm_advice_flags_t advice, uint32_t /*numSyncPointsInWaitList*/, const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, - ur_event_handle_t * /*phEvent*/, + ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMAdvise(pMemory, size, advice, nullptr)); + UR_CALL(commandListLocked->appendUSMAdvise( + pMemory, size, advice, numEventsInWaitList, phEventWaitList, phEvent)); + if (phEvent != nullptr) { + hCommandBuffer->registerEvent(*phEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); } - ur_result_t urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.hpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.hpp index 91f7df69c3d0..defde01c725a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.hpp @@ -51,6 +51,9 @@ struct ur_exp_command_buffer_handle_t_ : public ur_object { uint32_t numUpdateCommands, const ur_exp_command_buffer_update_kernel_launch_desc_t *updateCommands); + void enableEvents(); + void registerEvent(ur_event_handle_t event); + private: const ur_context_handle_t context; const ur_device_handle_t device; @@ -60,4 +63,5 @@ struct ur_exp_command_buffer_handle_t_ : public ur_object { bool isFinalized = false; ur_event_handle_t currentExecution = nullptr; + std::vector addedEvents; }; diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index d6f865d80b5c..34755783eb1e 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -18,14 +18,16 @@ ur_command_list_manager::ur_command_list_manager( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, v2::event_flags_t flags, - ur_queue_t_ *queue) - : context(context), device(device), - eventPool(context->getEventPoolCache().borrow(device->Id.value(), flags)), - zeCommandList(std::move(commandList)), queue(queue) { + ur_queue_t_ *queue, bool isImmediateCommandList) + : context(context), device(device), zeCommandList(std::move(commandList)), + queue(queue), isImmediateCommandList(isImmediateCommandList) { + auto &eventPoolTmp = isImmediateCommandList + ? context->getEventPoolCacheImmediate() + : context->getEventPoolCacheRegular(); + eventPool = eventPoolTmp.borrow(device->Id.value(), flags); UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); } - ur_command_list_manager::~ur_command_list_manager() { ur::level_zero::urContextRelease(context); ur::level_zero::urDeviceRelease(device); @@ -160,11 +162,30 @@ ur_result_t ur_command_list_manager::appendRegionCopyUnlocked( wait_list_view ur_command_list_manager::getWaitListView( const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, ur_event_handle_t additionalWaitEvent) { - + uint32_t numWaitEventsEnabled = 0; + if (isImmediateCommandList) { + for (uint32_t i = 0; i < numWaitEvents; i++) { + if (phWaitEvents[i]->getIsEventInUse()) { + numWaitEventsEnabled++; + } + } + } else { + numWaitEventsEnabled = numWaitEvents; + } uint32_t totalNumWaitEvents = numWaitEvents + (additionalWaitEvent != nullptr ? 1 : 0); waitList.resize(totalNumWaitEvents); for (uint32_t i = 0; i < numWaitEvents; i++) { + if (isImmediateCommandList && !phWaitEvents[i]->getIsEventInUse()) { + // We skip events on adding to immediate command list if they are not + // enabled + // TODO: This is a partial workaround for the underlying inconsistency + // between normal and counter events in L0 driver + // (the events that are not in use should be signaled by default, see + // /test/conformance/exp_command_buffer/kernel_event_sync.cpp + // KernelCommandEventSyncTest.SignalWaitBeforeEnqueue) + continue; + } waitList[i] = phWaitEvents[i]->getZeEvent(); } if (additionalWaitEvent != nullptr) { @@ -320,17 +341,18 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( return UR_RESULT_SUCCESS; } -ur_result_t -ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, - ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent) { +ur_result_t ur_command_list_manager::appendUSMAdvise( + const void *pMem, size_t size, ur_usm_advice_flags_t advice, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); auto zeAdvice = ur_cast(advice); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); if (pWaitEvents) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp index 74c3f85ea364..5085a794fd83 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp @@ -39,7 +39,8 @@ struct ur_command_list_manager { ur_command_list_manager(ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, - v2::event_flags_t flags, ur_queue_t_ *queue); + v2::event_flags_t flags, ur_queue_t_ *queue, + bool isImmediateCommandList); ur_command_list_manager(const ur_command_list_manager &src) = delete; ur_command_list_manager(ur_command_list_manager &&src) = default; @@ -128,6 +129,8 @@ struct ur_command_list_manager { ur_result_t appendUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t appendBarrier(uint32_t numEventsInWaitList, @@ -170,4 +173,5 @@ struct ur_command_list_manager { v2::raii::command_list_unique_handle zeCommandList; ur_queue_t_ *queue; std::vector waitList; + bool isImmediateCommandList; }; diff --git a/unified-runtime/source/adapters/level_zero/v2/context.cpp b/unified-runtime/source/adapters/level_zero/v2/context.cpp index 050511d379b0..2e6ef1c8eeaa 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.cpp @@ -53,7 +53,7 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext, commandListCache(hContext, {phDevices[0]->Platform->ZeCopyOffloadExtensionSupported, phDevices[0]->Platform->ZeMutableCmdListExt.Supported}), - eventPoolCache( + eventPoolCacheImmediate( this, phDevices[0]->Platform->getNumDevices(), [context = this](DeviceId /* deviceId*/, v2::event_flags_t flags) -> std::unique_ptr { @@ -63,6 +63,19 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext, return std::make_unique( context, v2::QUEUE_IMMEDIATE, flags); }), + eventPoolCacheRegular(this, phDevices[0]->Platform->getNumDevices(), + [context = this, platform = phDevices[0]->Platform]( + DeviceId deviceId, v2::event_flags_t flags) + -> std::unique_ptr { + assert((flags & v2::EVENT_FLAGS_COUNTER) != 0); + + std::ignore = deviceId; + std::ignore = platform; + + // TODO: just use per-context id? + return std::make_unique( + context, v2::QUEUE_REGULAR, flags); + }), nativeEventsPool(this, std::make_unique( this, v2::QUEUE_IMMEDIATE, v2::EVENT_FLAGS_PROFILING_ENABLED)), diff --git a/unified-runtime/source/adapters/level_zero/v2/context.hpp b/unified-runtime/source/adapters/level_zero/v2/context.hpp index 03bc20aa4617..c0f4cbe113c0 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.hpp @@ -34,7 +34,12 @@ struct ur_context_handle_t_ : ur_object { getP2PDevices(ur_device_handle_t hDevice) const; v2::event_pool &getNativeEventsPool() { return nativeEventsPool; } - v2::event_pool_cache &getEventPoolCache() { return eventPoolCache; } + v2::event_pool_cache &getEventPoolCacheImmediate() { + return eventPoolCacheImmediate; + } + v2::event_pool_cache &getEventPoolCacheRegular() { + return eventPoolCacheRegular; + } v2::command_list_cache_t &getCommandListCache() { return commandListCache; } // Checks if Device is covered by this context. @@ -45,7 +50,8 @@ struct ur_context_handle_t_ : ur_object { const v2::raii::ze_context_handle_t hContext; const std::vector hDevices; v2::command_list_cache_t commandListCache; - v2::event_pool_cache eventPoolCache; + v2::event_pool_cache eventPoolCacheImmediate; + v2::event_pool_cache eventPoolCacheRegular; // pool used for urEventCreateWithNativeHandle when native handle is NULL // (uses non-counter based events to allow for signaling from host) diff --git a/unified-runtime/source/adapters/level_zero/v2/event.cpp b/unified-runtime/source/adapters/level_zero/v2/event.cpp index ec3bf20b467b..f5eddd5187f5 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.cpp @@ -141,6 +141,11 @@ uint64_t ur_event_handle_t_::getEventEndTimestamp() { return profilingData.getEventEndTimestamp(); } +void ur_event_handle_t_::markEventAsNotInUse() { isEventInUse = false; } +void ur_event_handle_t_::markEventAsInUse() { isEventInUse = true; } + +bool ur_event_handle_t_::getIsEventInUse() const { return isEventInUse; } + void ur_event_handle_t_::reset() { // consider make an abstraction for regular/counter based // events if there's more of this type of conditions @@ -232,6 +237,14 @@ ur_result_t urEventRelease(ur_event_handle_t hEvent) try { ur_result_t urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) try { for (uint32_t i = 0; i < numEvents; ++i) { + if (!phEventWaitList[i]->getIsEventInUse()) { + // TODO: This is a workaround for the underlying inconsistency + // between normal and counter events in L0 driver + // (the events that are not in use should be signaled by default, see + // /test/conformance/exp_command_buffer/kernel_event_sync.cpp + // KernelCommandEventSyncTest.SignalWaitBeforeEnqueue) + continue; + } ZE2UR_CALL(zeEventHostSynchronize, (phEventWaitList[i]->getZeEvent(), UINT64_MAX)); } diff --git a/unified-runtime/source/adapters/level_zero/v2/event.hpp b/unified-runtime/source/adapters/level_zero/v2/event.hpp index 6ed0ebccbc56..ab0588e25706 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.hpp @@ -110,6 +110,9 @@ struct ur_event_handle_t_ : ur_object { uint64_t getEventStartTimestmap() const; uint64_t getEventEndTimestamp(); + void markEventAsInUse(); + void markEventAsNotInUse(); + bool getIsEventInUse() const; private: ur_event_handle_t_(ur_context_handle_t hContext, event_variant hZeEvent, @@ -128,6 +131,8 @@ struct ur_event_handle_t_ : ur_object { ur_command_t commandType = UR_COMMAND_FORCE_UINT32; ur_device_handle_t hDevice = nullptr; + // tells if event has been enqueued in some way (e.g. by appending to a queue) + bool isEventInUse = true; v2::event_flags_t flags; event_profiling_data_t profilingData; }; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 33c05a140201..7ce6fa3bf190 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -76,7 +76,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}), getZeIndex(pProps)), - eventFlagsFromQueueFlags(flags), this) {} + eventFlagsFromQueueFlags(flags), this, true) {} ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, @@ -93,7 +93,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( } } }), - eventFlagsFromQueueFlags(flags), this) {} + eventFlagsFromQueueFlags(flags), this, true) {} ze_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent( locked &commandList, ur_event_handle_t *hUserEvent, @@ -605,7 +605,8 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAdvise"); auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMAdvise(pMem, size, advice, phEvent)); + UR_CALL(commandListLocked->appendUSMAdvise(pMem, size, advice, 0, nullptr, + phEvent)); return UR_RESULT_SUCCESS; } @@ -912,6 +913,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBufferExp( 1, &commandBufferCommandList, phEvent, numEventsInWaitList, phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, executionEvent)); UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(*phEvent)); + hCommandBuffer->enableEvents(); if (internalEvent != nullptr) { internalEvent->release(); } diff --git a/unified-runtime/test/conformance/exp_command_buffer/fixtures.h b/unified-runtime/test/conformance/exp_command_buffer/fixtures.h index b45b60812319..77ffbbd60dc4 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/fixtures.h +++ b/unified-runtime/test/conformance/exp_command_buffer/fixtures.h @@ -377,6 +377,7 @@ struct urCommandEventSyncUpdateTest : urCommandEventSyncTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urCommandEventSyncTest::SetUp()); + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); auto required_capabilities = UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS; UUR_RETURN_ON_FATAL_FAILURE( diff --git a/unified-runtime/test/conformance/exp_command_buffer/rect_read.cpp b/unified-runtime/test/conformance/exp_command_buffer/rect_read.cpp index a0adf21ac7de..65209d0c2c1b 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/rect_read.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/rect_read.cpp @@ -59,9 +59,9 @@ static std::vector generateParameterizations() { 64, 8, 64); // Tests that a 4x16x2 region can be read from a 8x32x1 device buffer at // offset {1,2,0} to a 8x32x4 host buffer at offset {4,1,3}. - PARAMETERIZATION(write_2d_3d, 256, 1024, (ur_rect_offset_t{1, 2, 0}), - (ur_rect_offset_t{4, 1, 3}), (ur_rect_region_t{4, 16, 1}), 8, - 256, 8, 256); + // PARAMETERIZATION(write_2d_3d, 256, 1024, (ur_rect_offset_t{1, 2, 0}), + // (ur_rect_offset_t{4, 1, 3}), (ur_rect_region_t{4, 16, 1}), 8, + // 256, 8, 256); // Tests that a 1x4x1 region can be read from a 8x16x4 device buffer at // offset {7,3,3} to a 2x8x1 host buffer at offset {1,3,0}. // PARAMETERIZATION(write_3d_2d, 512, 16, (ur_rect_offset_t{7, 3, 3}),