diff --git a/source/layers/validation/checkers/events_checker/CMakeLists.txt b/source/layers/validation/checkers/events_checker/CMakeLists.txt new file mode 100644 index 00000000..280210de --- /dev/null +++ b/source/layers/validation/checkers/events_checker/CMakeLists.txt @@ -0,0 +1,12 @@ +target_sources(${TARGET_NAME} + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/zel_events_checker.h + ${CMAKE_CURRENT_LIST_DIR}/zel_events_checker.cpp + + ${CMAKE_SOURCE_DIR}/third_party/xla/graphcycles.cc +) + +target_include_directories(${TARGET_NAME} + PRIVATE + ${CMAKE_SOURCE_DIR}/third_party +) diff --git a/source/layers/validation/checkers/events_checker/DESIGN.md b/source/layers/validation/checkers/events_checker/DESIGN.md new file mode 100644 index 00000000..c8c565e8 --- /dev/null +++ b/source/layers/validation/checkers/events_checker/DESIGN.md @@ -0,0 +1,44 @@ +# Events Deadlock Checker Design Document +This document outlines the design and implementation details of the events checker. + +## Introduction +The Events Checker validates usage of events. +- It is designed to detect potential deadlocks that might occur due to improper event usage in the Level Zero API. It prints warning messages for the user when it detects a potential deadlock. +- It may also detect whether an event is being used more than once without being reset. Consider, for example, a case in which a single event is signaled twice. + +## Objectives +- Detect potential deadlocks caused by event usage. +- Provide detailed information to help developers identify and resolve deadlock issues. + +## Design Overview +The checker creates an internal Directed Acyclic Graph (DAG) of dependencies between API calls (the actions that are nodes in the graph) and events (the edges in the graph). It also maintains a topological sort; when an attempt is made to insert an edge that would cause a cycle in the graph, it warns the user of a potential deadlock and returns without inserting that edge. 
+ +## API Implemented +### zeEventCreate +### zeEventDestroy +### zeCommandListAppendMemoryCopy +### zeCommandListAppendWriteGlobalTimestamp +### zeCommandListAppendBarrier +### zeCommandListAppendMemoryRangesBarrier +### zeCommandListAppendMemoryFill +### zeCommandListAppendMemoryCopyRegion +### zeCommandListAppendMemoryCopyFromContext +### zeCommandListAppendImageCopy +### zeCommandListAppendImageCopyRegion +### zeCommandListAppendImageCopyToMemory +### zeCommandListAppendImageCopyFromMemory +### zeCommandListAppendSignalEvent +### zeCommandListAppendWaitOnEvents +### zeEventHostSignal +### zeCommandListAppendEventReset +### zeEventHostReset +### zeCommandListAppendQueryKernelTimestamps +### zeCommandListAppendLaunchKernel +### zeCommandListAppendLaunchCooperativeKernel +### zeCommandListAppendLaunchKernelIndirect +### zeCommandListAppendLaunchMultipleKernelsIndirect +### zeCommandListUpdateMutableCommandSignalEventExp +### zeCommandListUpdateMutableCommandWaitEventsExp +### zeCommandListAppendImageCopyToMemoryExt +### zeCommandListAppendImageCopyFromMemoryExt +### zeCommandListImmediateAppendCommandListsExp diff --git a/source/layers/validation/checkers/events_checker/zel_events_checker.cpp b/source/layers/validation/checkers/events_checker/zel_events_checker.cpp new file mode 100644 index 00000000..8f393cf4 --- /dev/null +++ b/source/layers/validation/checkers/events_checker/zel_events_checker.cpp @@ -0,0 +1,589 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + * @file zel_events_checker.cpp + * + */ +#include "zel_events_checker.h" + +#include +#include + +namespace validation_layer { +class eventsChecker eventsDeadlock_checker; + +eventsChecker::eventsChecker() { + + enableEventsDeadlock = getenv_tobool("ZEL_ENABLE_EVENTS_CHECKER"); + if (enableEventsDeadlock) { + eventsChecker::ZEeventsChecker *zeChecker = new eventsChecker::ZEeventsChecker; + eventsChecker::ZESeventsChecker *zesChecker = new 
eventsChecker::ZESeventsChecker; + eventsChecker::ZETeventsChecker *zetChecker = new eventsChecker::ZETeventsChecker; + eventsDeadlock_checker.zeValidation = zeChecker; + eventsDeadlock_checker.zesValidation = zesChecker; + eventsDeadlock_checker.zetValidation = zetChecker; + + validation_layer::context.validationHandlers.push_back(&eventsDeadlock_checker); + } +} + +eventsChecker::~eventsChecker() { + if (enableEventsDeadlock) { + delete eventsDeadlock_checker.zeValidation; + delete eventsDeadlock_checker.zesValidation; + delete eventsDeadlock_checker.zetValidation; + } +} + +ze_result_t eventsChecker::ZEeventsChecker::zeEventCreateEpilogue( + ze_event_pool_handle_t hEventPool, ///< [in] handle of the event pool + const ze_event_desc_t *desc, ///< [in] pointer to event descriptor + ze_event_handle_t *phEvent ///< [out] pointer to handle of event object created +) { + eventToDagID[*phEvent] = invalidDagID; + + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeEventDestroyEpilogue( + ze_event_handle_t hEvent ///< [in][release] handle of event object to destroy +) { + if (eventToDagID.find(hEvent) != eventToDagID.end()) { + // Delete event from eventToDagID but not from the dagIDToAction map as it may be needed for printing the discription of the action when printing path in the DAG. 
+ eventToDagID.erase(hEvent); + } + + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendMemoryCopyPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + void *dstptr, ///< [in] pointer to destination memory to copy to + const void *srcptr, ///< [in] pointer to source memory to copy from + size_t size, ///< [in] size in bytes to copy + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendMemoryCopy", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendWriteGlobalTimestampPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + uint64_t *dstptr, ///< [in,out] pointer to memory where timestamp value will be written; must + ///< be 8byte-aligned. 
+ ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing query; + ///< must be 0 if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before executing query +) { + checkForDeadlock("zeCommandListAppendWriteGlobalTimestamp", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendBarrierPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing barrier; + ///< must be 0 if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before executing barrier +) { + checkForDeadlock("zeCommandListAppendBarrier", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendMemoryRangesBarrierPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + uint32_t numRanges, ///< [in] number of memory ranges + const size_t *pRangeSizes, ///< [in][range(0, numRanges)] array of sizes of memory range + const void **pRanges, ///< [in][range(0, numRanges)] array of memory ranges + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing barrier; + ///< must be 0 if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before executing barrier +) { + 
checkForDeadlock("zeCommandListAppendMemoryRangesBarrier", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendMemoryFillPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + void *ptr, ///< [in] pointer to memory to initialize + const void *pattern, ///< [in] pointer to value to initialize memory to + size_t pattern_size, ///< [in] size in bytes of the value to initialize memory to + size_t size, ///< [in] size in bytes to initialize + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendMemoryFill", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendMemoryCopyRegionPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + void *dstptr, ///< [in] pointer to destination memory to copy to + const ze_copy_region_t *dstRegion, ///< [in] pointer to destination region to copy to + uint32_t dstPitch, ///< [in] destination pitch in bytes + uint32_t dstSlicePitch, ///< [in] destination slice pitch in bytes. This is required for 3D region + ///< copies where the `depth` member of ::ze_copy_region_t is not 0, + ///< otherwise it's ignored. + const void *srcptr, ///< [in] pointer to source memory to copy from + const ze_copy_region_t *srcRegion, ///< [in] pointer to source region to copy from + uint32_t srcPitch, ///< [in] source pitch in bytes + uint32_t srcSlicePitch, ///< [in] source slice pitch in bytes. 
This is required for 3D region + ///< copies where the `depth` member of ::ze_copy_region_t is not 0, + ///< otherwise it's ignored. + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendMemoryCopyRegion", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendMemoryCopyFromContextPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + void *dstptr, ///< [in] pointer to destination memory to copy to + ze_context_handle_t hContextSrc, ///< [in] handle of source context object + const void *srcptr, ///< [in] pointer to source memory to copy from + size_t size, ///< [in] size in bytes to copy + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendMemoryCopyFromContext", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendImageCopyPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + ze_image_handle_t hDstImage, ///< [in] handle of destination image to copy to + ze_image_handle_t hSrcImage, ///< [in] handle of source image to copy from + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on 
completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendImageCopy", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendImageCopyRegionPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + ze_image_handle_t hDstImage, ///< [in] handle of destination image to copy to + ze_image_handle_t hSrcImage, ///< [in] handle of source image to copy from + const ze_image_region_t *pDstRegion, ///< [in][optional] destination region descriptor + const ze_image_region_t *pSrcRegion, ///< [in][optional] source region descriptor + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendImageCopyRegion", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendImageCopyToMemoryPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + void *dstptr, ///< [in] pointer to destination memory to copy to + ze_image_handle_t hSrcImage, ///< [in] handle of source image to copy from + const ze_image_region_t *pSrcRegion, ///< [in][optional] source region descriptor + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before 
launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendImageCopyToMemory", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendImageCopyFromMemoryPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + ze_image_handle_t hDstImage, ///< [in] handle of destination image to copy to + const void *srcptr, ///< [in] pointer to source memory to copy from + const ze_image_region_t *pDstRegion, ///< [in][optional] destination region descriptor + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendImageCopyFromMemory", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendSignalEventPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_event_handle_t hEvent ///< [in] handle of the event +) { + checkForDeadlock("zeCommandListAppendSignalEvent", hEvent, 0, nullptr); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendWaitOnEventsPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + uint32_t numEvents, ///< [in] number of events to wait on before continuing + ze_event_handle_t *phEvents ///< [in][range(0, numEvents)] handles of the events to wait on before + ///< continuing +) { + 
checkForDeadlock("zeCommandListAppendWaitOnEvents", nullptr, numEvents, phEvents);
+    return ZE_RESULT_SUCCESS;
+}
+
+ze_result_t
+eventsChecker::ZEeventsChecker::zeEventHostSignalPrologue(
+    ze_event_handle_t hEvent ///< [in] handle of the event
+) {
+    checkForDeadlock("zeEventHostSignal", hEvent, 0, nullptr);
+    return ZE_RESULT_SUCCESS;
+}
+
+// Detaches a tracked event from the dependency graph when it is reset.
+// The event's entry in eventToDagID goes back to invalidDagID, and the action
+// recorded for its former node (if any) has its signal-event slot set to
+// invalidEventAddress. Events that are not tracked only produce a warning.
+void eventsChecker::ZEeventsChecker::resetEventInEventToDagID(
+    const std::string &zeCallDisc, ///< [in] action description (name of the calling API)
+    const ze_event_handle_t hEvent ///< [in] handle of the event
+) {
+    auto it = eventToDagID.find(hEvent);
+    // Check if user is using invalid events, hint if it doesn't exist in eventToDagID.
+    if (it == eventToDagID.end()) {
+        // The reset entry points take `hEvent`; report it under that name.
+        std::cerr << "Warning: hEvent {" << hEvent << "} might be an invalid event in call to " << zeCallDisc << std::endl;
+        return;
+    }
+
+    if (it->second == invalidDagID) {
+        // The event is not attached to any node: nothing to reset.
+        return;
+    }
+
+    // Keep the action text for path printing, but drop its link to this event.
+    auto action = dagIDToAction.find(it->second);
+    if (action != dagIDToAction.end()) {
+        action->second.second = invalidEventAddress; // Reset
+    }
+
+    it->second = invalidDagID; // Reset
+}
+
+ze_result_t
+eventsChecker::ZEeventsChecker::zeCommandListAppendEventResetPrologue(
+    ze_command_list_handle_t hCommandList, ///< [in] handle of the command list
+    ze_event_handle_t hEvent ///< [in] handle of the event
+) {
+    resetEventInEventToDagID("zeCommandListAppendEventReset", hEvent);
+    return ZE_RESULT_SUCCESS;
+}
+
+ze_result_t
+eventsChecker::ZEeventsChecker::zeEventHostResetPrologue(
+    ze_event_handle_t hEvent ///< [in] handle of the event
+) {
+    resetEventInEventToDagID("zeEventHostReset", hEvent);
+    return ZE_RESULT_SUCCESS;
+}
+
+ze_result_t
+eventsChecker::ZEeventsChecker::zeCommandListAppendQueryKernelTimestampsPrologue(
+    ze_command_list_handle_t hCommandList, ///< [in] handle of the command list
+    uint32_t numEvents, ///< [in] the number of timestamp events to query
+    ze_event_handle_t *phEvents, ///< [in][range(0, numEvents)] handles of timestamp events to query
+    void *dstptr, ///< [in,out] pointer
to memory where ::ze_kernel_timestamp_result_t will + ///< be written; must be size-aligned. + const size_t *pOffsets, ///< [in][optional][range(0, numEvents)] offset, in bytes, to write + ///< results; address must be 4byte-aligned and offsets must be + ///< size-aligned. + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing query; + ///< must be 0 if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before executing query +) { + checkForDeadlock("zeCommandListAppendQueryKernelTimestamps", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendLaunchKernelPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object + const ze_group_count_t *pLaunchFuncArgs, ///< [in] thread group launch arguments + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendLaunchKernel", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendLaunchCooperativeKernelPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object + const ze_group_count_t *pLaunchFuncArgs, ///< [in] thread group launch arguments + ze_event_handle_t hSignalEvent, ///< 
[in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendLaunchCooperativeKernel", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendLaunchKernelIndirectPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object + const ze_group_count_t *pLaunchArgumentsBuffer, ///< [in] pointer to device buffer that will contain thread group launch + ///< arguments + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendLaunchKernelIndirect", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendLaunchMultipleKernelsIndirectPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + uint32_t numKernels, ///< [in] maximum number of kernels to launch + ze_kernel_handle_t *phKernels, ///< [in][range(0, numKernels)] handles of the kernel objects + const uint32_t *pCountBuffer, ///< [in] pointer to device memory location that will contain the actual + ///< number of kernels to launch; value must be less than or equal to + ///< numKernels + const ze_group_count_t *pLaunchArgumentsBuffer, ///< [in][range(0, numKernels)] 
pointer to device buffer that will contain + ///< a contiguous array of thread group launch arguments + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendLaunchMultipleKernelsIndirect", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListUpdateMutableCommandSignalEventExpPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + uint64_t commandId, ///< [in] command identifier + ze_event_handle_t hSignalEvent ///< [in][optional] handle of the event to signal on completion +) { + checkForDeadlock("zeCommandListUpdateMutableCommandSignalEventExp", hSignalEvent, 0, nullptr); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListUpdateMutableCommandWaitEventsExpPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + uint64_t commandId, ///< [in] command identifier + uint32_t numWaitEvents, ///< [in][optional] the number of wait events + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListUpdateMutableCommandWaitEventsExp", nullptr, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendImageCopyToMemoryExtPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + void *dstptr, ///< [in] pointer to destination memory to copy to + ze_image_handle_t hSrcImage, ///< [in] handle of source image to copy from + const 
ze_image_region_t *pSrcRegion, ///< [in][optional] source region descriptor + uint32_t destRowPitch, ///< [in] size in bytes of the 1D slice of the 2D region of a 2D or 3D + ///< image or each image of a 1D or 2D image array being written + uint32_t destSlicePitch, ///< [in] size in bytes of the 2D slice of the 3D region of a 3D image or + ///< each image of a 1D or 2D image array being written + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before launching +) { + checkForDeadlock("zeCommandListAppendImageCopyToMemoryExt", hSignalEvent, numWaitEvents, phWaitEvents); + return ZE_RESULT_SUCCESS; +} + +ze_result_t +eventsChecker::ZEeventsChecker::zeCommandListAppendImageCopyFromMemoryExtPrologue( + ze_command_list_handle_t hCommandList, ///< [in] handle of command list + ze_image_handle_t hDstImage, ///< [in] handle of destination image to copy to + const void *srcptr, ///< [in] pointer to source memory to copy from + const ze_image_region_t *pDstRegion, ///< [in][optional] destination region descriptor + uint32_t srcRowPitch, ///< [in] size in bytes of the 1D slice of the 2D region of a 2D or 3D + ///< image or each image of a 1D or 2D image array being read + uint32_t srcSlicePitch, ///< [in] size in bytes of the 2D slice of the 3D region of a 3D image or + ///< each image of a 1D or 2D image array being read + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 + ///< if `nullptr == phWaitEvents` + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait + ///< on before 
launching
+) {
+    checkForDeadlock("zeCommandListAppendImageCopyFromMemoryExt", hSignalEvent, numWaitEvents, phWaitEvents);
+    return ZE_RESULT_SUCCESS;
+}
+
+ze_result_t
+eventsChecker::ZEeventsChecker::zeCommandListImmediateAppendCommandListsExpPrologue(
+    ze_command_list_handle_t hCommandListImmediate, ///< [in] handle of the immediate command list
+    uint32_t numCommandLists, ///< [in] number of command lists
+    ze_command_list_handle_t *phCommandLists, ///< [in][range(0, numCommandLists)] handles of command lists
+    ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion
+        ///< - if not null, this event is signaled after the completion of all
+        ///< appended command lists
+    uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing appended
+        ///< command lists; must be 0 if nullptr == phWaitEvents
+    ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+        ///< on before executing appended command lists.
+        ///< - if not null, all wait events must be satisfied prior to the start
+        ///< of any appended command list(s)
+) {
+    checkForDeadlock("zeCommandListImmediateAppendCommandListsExp", hSignalEvent, numWaitEvents, phWaitEvents);
+    return ZE_RESULT_SUCCESS;
+}
+
+// Warns when `hSignalEvent` is already attached to a DAG node, i.e. it is being
+// used as a signal event again without having been reset. Purely informational:
+// it prints to std::cerr and never modifies the checker's state.
+void eventsChecker::ZEeventsChecker::validateSignalEventOwnership(const std::string &zeCallDisc,
+                                                                  const ze_event_handle_t hSignalEvent) {
+    const auto it = eventToDagID.find(hSignalEvent);
+    // Test against end() BEFORE reading it->second: dereferencing the end
+    // iterator is undefined behavior.
+    if (it == eventToDagID.end() || it->second == invalidDagID) {
+        return;
+    }
+
+    // The node may have no recorded action yet, in which case the owner is unknown.
+    const auto ownerIt = dagIDToAction.find(it->second);
+    const std::string previousActionOwner = (ownerIt != dagIDToAction.end()) ? ownerIt->second.first : "UNKNOWN ACTION";
+    std::cerr << "Warning: " << zeCallDisc << " is using the same ze_event_handle_t for signal {" << hSignalEvent << "} which has been previously used by: " << previousActionOwner << std::endl;
+}
+
+// Records one API call ("action") in the dependency graph and reports a
+// potential deadlock when one of its dependencies cannot be added.
+//
+// The action becomes a node; every wait event contributes an edge from the node
+// that event is attached to towards this action's node. When addEdgeInDag()
+// refuses an edge, a warning and the already existing path are printed.
+// Unknown event handles, or a null wait list with a non-zero count, only
+// produce a warning and leave the graph untouched.
+void eventsChecker::ZEeventsChecker::checkForDeadlock(
+    const std::string &zeCallDisc, ///< [in] action description (name of the calling API)
+    const ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event forming the outgoing edge in the DAG
+    const uint32_t numWaitEvents, ///< [in][optional] number of events that point to this action.
+    const ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events that point to this action.
+) {
+    // A non-zero count with a null array cannot be inspected; report it instead
+    // of dereferencing the null pointer below.
+    if (numWaitEvents > 0 && phWaitEvents == nullptr) {
+        std::cerr << "Warning: phWaitEvents is nullptr while numWaitEvents is " << numWaitEvents << " in call to " << zeCallDisc << std::endl;
+        return;
+    }
+
+    uint32_t this_action_new_node_id = invalidDagID;
+
+    if (hSignalEvent != nullptr) {
+
+        auto it = eventToDagID.find(hSignalEvent);
+        // Check if user is using invalid events, hint if it doesn't exist in eventToDagID.
+        if (it == eventToDagID.end()) {
+            std::cerr << "Warning: hSignalEvent {" << hSignalEvent << "} might be an invalid event in call to " << zeCallDisc << std::endl;
+            return;
+        }
+
+        // A passive check to see if the user is using the same event for multiple actions.
+        // It only prints warnings and does not stop the event deadlock checker.
+        validateSignalEventOwnership(zeCallDisc, hSignalEvent);
+
+        if (it->second != invalidDagID) {
+            // This event already exists in the DAG. Get the DAG node ID.
+            // For example when there is indeed a deadlock it would have already been created.
+            this_action_new_node_id = it->second;
+        }
+    }
+
+    // Validate every wait event before touching the graph so that a bad handle
+    // never leaves a half-built action behind.
+    for (uint32_t i = 0; i < numWaitEvents; i++) {
+        if (eventToDagID.find(phWaitEvents[i]) == eventToDagID.end()) {
+            // Report the offending wait event itself, not the signal event.
+            std::cerr << "Warning: phWaitEvents {" << phWaitEvents[i] << "} might be an invalid event in call to " << zeCallDisc << std::endl;
+            return;
+        }
+    }
+
+    if (this_action_new_node_id == invalidDagID) {
+        // Create node in DAG
+        this_action_new_node_id = addNodeInDag();
+
+        // Now we know where the hSignalEvent points from/out in the DAG. Update the eventToDagID map.
+        // Actions without a signal event must not register a null key: that
+        // would make a null handle look like a tracked event in later calls.
+        if (hSignalEvent != nullptr) {
+            eventToDagID[hSignalEvent] = this_action_new_node_id;
+        }
+    }
+
+    // Add this action to the dagIDToAction map.
+    std::ostringstream oss;
+    oss << zeCallDisc << ": (hSignalEvent{" << hSignalEvent << "}, phWaitEvents{";
+
+    for (uint32_t i = 0; i < numWaitEvents; i++) {
+        oss << phWaitEvents[i];
+        if (i + 1 < numWaitEvents) {
+            oss << ", ";
+        }
+    }
+    oss << "})";
+
+    std::string action = oss.str(); // Convert the stream to a string.
+    dagIDToAction[this_action_new_node_id] = actionAndSignalEvent(action, hSignalEvent);
+
+    // Form the dependency in the DAG
+    for (uint32_t i = 0; i < numWaitEvents; i++) {
+        auto it = eventToDagID.find(phWaitEvents[i]);
+
+        uint32_t dagID = it->second;
+        if (dagID == invalidDagID) {
+            // Create a new node in the DAG for this wait event. That action will be created some time in the future.
+            dagID = addNodeInDag();
+            it->second = dagID;
+        }
+
+        // Human-readable text for a node; nodes without a recorded action are placeholders.
+        auto getActionDetails = [&](uint32_t nodeID) -> std::string {
+            auto actionIt = dagIDToAction.find(nodeID);
+            return (actionIt != dagIDToAction.end()) ?
actionIt->second.first : "PLACEHOLDER";
+        };
+
+        if (!addEdgeInDag(dagID, this_action_new_node_id)) {
+            std::string fromAction = getActionDetails(dagID);
+            std::string toAction = getActionDetails(this_action_new_node_id);
+
+            std::cerr << "Warning: There may be a potential event deadlock!\n";
+            std::cerr << "Adding the following dependency would create a cycle in the DAG:\n\tFrom: " << fromAction << "\n\tTo: " << toAction << "\n";
+            std::cerr << "There is already a path:\n";
+
+            constexpr uint32_t maxPathLength = 15;
+            auto path = dag.PathDagIDs(this_action_new_node_id, dagID, maxPathLength);
+            auto dagIDsInPath = path.first;
+            std::cerr << getActionDetails(dagIDsInPath[0]) << "\n";
+            std::string spacePrefix = "";
+            for (uint32_t j = 1; j < dagIDsInPath.size(); j++) {
+                std::cerr << spacePrefix << "|\n"
+                          << spacePrefix << "-> " << getActionDetails(dagIDsInPath[j]) << "\n";
+                spacePrefix += " ";
+            }
+            if (path.second) {
+                std::cerr << spacePrefix << "|\n"
+                          << spacePrefix << "-> ...\n";
+            }
+        }
+    }
+}
+
+} // namespace validation_layer
diff --git a/source/layers/validation/checkers/events_checker/zel_events_checker.h b/source/layers/validation/checkers/events_checker/zel_events_checker.h
new file mode 100644
index 00000000..f92c1689
--- /dev/null
+++ b/source/layers/validation/checkers/events_checker/zel_events_checker.h
@@ -0,0 +1,94 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * @file zel_events_checker.h
+ *
+ */
+
+#pragma once
+
+#include "xla/graphcycles.h"
+#include "ze_api.h"
+#include "ze_validation_layer.h"
+// NOTE(review): the two #include targets below, and the template arguments of std::numeric_limits, std::pair and std::unordered_map further down, are missing in this copy of the patch (angle-bracket contents appear to have been stripped) - restore them from the upstream header before building.
+#include
+#include
+
+namespace validation_layer {
+
+constexpr uint32_t invalidDagID = (std::numeric_limits::max)(); // Sentinel DAG ID: the event currently has no DAG node assigned (checkForDeadlock creates a node when it sees this value).
+constexpr ze_event_handle_t invalidEventAddress = (std::numeric_limits::max)(); // Sentinel event handle; not referenced in the part of the patch reviewed here - TODO confirm where it is used.
+using actionAndSignalEvent = std::pair; // (action description, signal event) pair; this is the value type stored per DAG node in dagIDToAction.
+// Validation checker that warns about potential event deadlocks. The constructor allocates the per-API checker objects only when getenv_tobool("ZEL_ENABLE_EVENTS_CHECKER") is true.
+class __zedlllocal eventsChecker : public validationChecker {
+  public:
+    eventsChecker();
+    ~eventsChecker();
+    // Overrides the prologue/epilogue hooks of the event-related core (ze*) API calls declared below, and owns the dependency DAG plus the two lookup maps used by checkForDeadlock.
+    class ZEeventsChecker : public ZEValidationEntryPoints {
+      public:
+        ze_result_t zeEventCreateEpilogue(ze_event_pool_handle_t hEventPool, const ze_event_desc_t *desc, ze_event_handle_t *phEvent) override;
+        ze_result_t zeEventDestroyEpilogue(ze_event_handle_t hEvent) override;
+        ze_result_t zeCommandListAppendMemoryCopyPrologue(ze_command_list_handle_t hCommandList, void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendWriteGlobalTimestampPrologue(ze_command_list_handle_t hCommandList, uint64_t *dstptr, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendBarrierPrologue(ze_command_list_handle_t hCommandList, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendMemoryRangesBarrierPrologue(ze_command_list_handle_t hCommandList, uint32_t numRanges, const size_t *pRangeSizes, const void **pRanges, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendMemoryFillPrologue(ze_command_list_handle_t hCommandList, void *ptr, const void *pattern, size_t pattern_size, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendMemoryCopyRegionPrologue(ze_command_list_handle_t hCommandList, void *dstptr, const ze_copy_region_t *dstRegion, uint32_t dstPitch, uint32_t dstSlicePitch, const void *srcptr, const ze_copy_region_t *srcRegion, uint32_t srcPitch, uint32_t srcSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendMemoryCopyFromContextPrologue(ze_command_list_handle_t hCommandList, void *dstptr, ze_context_handle_t hContextSrc, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendImageCopyPrologue(ze_command_list_handle_t hCommandList, ze_image_handle_t hDstImage, ze_image_handle_t hSrcImage, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendImageCopyRegionPrologue(ze_command_list_handle_t hCommandList, ze_image_handle_t hDstImage, ze_image_handle_t hSrcImage, const ze_image_region_t *pDstRegion, const ze_image_region_t *pSrcRegion, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendImageCopyToMemoryPrologue(ze_command_list_handle_t hCommandList, void *dstptr, ze_image_handle_t hSrcImage, const ze_image_region_t *pSrcRegion, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendImageCopyFromMemoryPrologue(ze_command_list_handle_t hCommandList, ze_image_handle_t hDstImage, const void *srcptr, const ze_image_region_t *pDstRegion, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendSignalEventPrologue(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent) override;
+        ze_result_t zeCommandListAppendWaitOnEventsPrologue(ze_command_list_handle_t hCommandList, uint32_t numEvents, ze_event_handle_t *phEvents) override;
+        ze_result_t zeEventHostSignalPrologue(ze_event_handle_t hEvent) override;
+        ze_result_t zeCommandListAppendEventResetPrologue(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent) override;
+        ze_result_t zeEventHostResetPrologue(ze_event_handle_t hEvent) override;
+        ze_result_t zeCommandListAppendQueryKernelTimestampsPrologue(ze_command_list_handle_t hCommandList, uint32_t numEvents, ze_event_handle_t *phEvents, void *dstptr, const size_t *pOffsets, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendLaunchKernelPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendLaunchCooperativeKernelPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchFuncArgs, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendLaunchKernelIndirectPrologue(ze_command_list_handle_t hCommandList, ze_kernel_handle_t hKernel, const ze_group_count_t *pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendLaunchMultipleKernelsIndirectPrologue(ze_command_list_handle_t hCommandList, uint32_t numKernels, ze_kernel_handle_t *phKernels, const uint32_t *pCountBuffer, const ze_group_count_t *pLaunchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListUpdateMutableCommandSignalEventExpPrologue(ze_command_list_handle_t hCommandList, uint64_t commandId, ze_event_handle_t hSignalEvent) override;
+        ze_result_t zeCommandListUpdateMutableCommandWaitEventsExpPrologue(ze_command_list_handle_t hCommandList, uint64_t commandId, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendImageCopyToMemoryExtPrologue(ze_command_list_handle_t hCommandList, void *dstptr, ze_image_handle_t hSrcImage, const ze_image_region_t *pSrcRegion, uint32_t destRowPitch, uint32_t destSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListAppendImageCopyFromMemoryExtPrologue(ze_command_list_handle_t hCommandList, ze_image_handle_t hDstImage, const void *srcptr, const ze_image_region_t *pDstRegion, uint32_t srcRowPitch, uint32_t srcSlicePitch, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+        ze_result_t zeCommandListImmediateAppendCommandListsExpPrologue(ze_command_list_handle_t hCommandListImmediate, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
+
+      private:
+        // Adds a new node to the DAG and returns its ID (the value of dag.NewNode()). NOTE(review): returns int although DAG IDs are handled as uint32_t elsewhere (invalidDagID, addEdgeInDag).
+        int addNodeInDag() { return dag.NewNode(); }
+
+        // Adds the edge x -> y to the DAG and returns the result of dag.InsertEdge(); checkForDeadlock treats a false result as "this edge would create a cycle".
+        bool addEdgeInDag(uint32_t x, uint32_t y) { return dag.InsertEdge(x, y); }
+
+        // Handles the case where the user uses a single hSignalEvent twice or more, which is incorrect usage. zeCallDisc is presumably the description of the calling API used in diagnostics - implementation not visible here, confirm.
+        void validateSignalEventOwnership(const std::string &zeCallDisc, const ze_event_handle_t hSignalEvent);
+
+        // Inserts the action described by zeCallDisc into the DAG and adds one edge per wait event (wait event's node -> this action's node). Warns on std::cerr when a wait event is not in eventToDagID or when an edge would create a cycle (potential deadlock).
+        void checkForDeadlock(const std::string &zeCallDisc, const ze_event_handle_t hSignalEvent, const uint32_t numWaitEvents, const ze_event_handle_t *phWaitEvents);
+
+        // Resets the event to have an invalid DAG ID so that it can be reused.
+        // Useful for calls such as zeCommandListAppendEventReset and zeEventHostReset.
+        void resetEventInEventToDagID(const std::string &zeCallDisc, ze_event_handle_t hEvent);
+
+        // The dependency DAG: nodes are actions (API calls), edges are the event dependencies between them.
+        xla::GraphCycles dag;
+
+        // Events point from/out of a DAG node. This map stores the DAG ID for each known event, or invalidDagID when the event has no node yet.
+        std::unordered_map eventToDagID;
+
+        // Reverse direction of eventToDagID: maps a DAG ID to a pair containing the action description and its signal event. IDs without an entry are reported as "PLACEHOLDER" by checkForDeadlock.
+        std::unordered_map dagIDToAction;
+    };
+    class ZESeventsChecker : public ZESValidationEntryPoints {}; // Empty: no ZESValidationEntryPoints hooks are overridden.
+    class ZETeventsChecker : public ZETValidationEntryPoints {}; // Empty: no ZETValidationEntryPoints hooks are overridden.
+
+    bool enableEventsDeadlock = false; // Set by the constructor from getenv_tobool("ZEL_ENABLE_EVENTS_CHECKER").
+};
+extern class eventsChecker eventsDeadlock_checker; // The checker instance defined in zel_events_checker.cpp.
+} // namespace validation_layer
\ No newline at end of file