Skip to content

Commit dcaf532

Browse files
committed
feature: events deadlock detection in validation layer
Related-To: NEO-12810 Signed-off-by: Chandio, Bibrak Qamar <[email protected]>
1 parent 3adf129 commit dcaf532

File tree

2 files changed

+44
-40
lines changed

2 files changed

+44
-40
lines changed

samples/zello_events_deadlock/zello_events_deadlock.cpp

Lines changed: 28 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -130,13 +130,14 @@ int main(int argc, char *argv[]) {
130130
exit(1);
131131
}
132132

133-
// Create an immediate command list for direct submission
134-
ze_command_queue_desc_t altdesc = {};
135-
altdesc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
133+
// Create a command list for direct submission
134+
ze_command_list_desc_t altdesc = {};
135+
altdesc.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC;
136+
136137
ze_command_list_handle_t command_list = {};
137-
status = zeCommandListCreateImmediate(context, pDevice, &altdesc, &command_list);
138+
status = zeCommandListCreate(context, pDevice, &altdesc, &command_list);
138139
if (status != ZE_RESULT_SUCCESS) {
139-
std::cout << "zeCommandListCreateImmediate Failed with return code: " << to_string(status) << std::endl;
140+
std::cout << "zeCommandListCreate Failed with return code: " << to_string(status) << std::endl;
140141
exit(1);
141142
}
142143

@@ -155,7 +156,7 @@ int main(int argc, char *argv[]) {
155156
}
156157

157158
std::vector<ze_event_handle_t> event{};
158-
// Two events for memcpy that will form a dependency on a 3rd event
159+
// Three events for memcpy that will form a circular dependency.
159160
event.resize(3);
160161

161162
ze_event_desc_t ev_desc = {};
@@ -178,9 +179,6 @@ int main(int argc, char *argv[]) {
178179
ze_event_handle_t start_event;
179180
SUCCESS_OR_TERMINATE(zeEventCreate(event_pool, &ev_desc, &start_event)); */
180181

181-
std::cout << std::endl
182-
<< std::endl;
183-
184182
ze_host_mem_alloc_desc_t host_desc = {};
185183
host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
186184
host_desc.pNext = nullptr;
@@ -193,7 +191,7 @@ int main(int argc, char *argv[]) {
193191
ze_device_mem_alloc_desc_t device_desc = {};
194192
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
195193
device_desc.pNext = nullptr;
196-
device_desc.ordinal = 0;
194+
// device_desc.ordinal = 0;
197195
device_desc.flags = 0;
198196

199197
void *device_mem_ptr = nullptr;
@@ -203,20 +201,21 @@ int main(int argc, char *argv[]) {
203201
<< std::endl;
204202

205203
// Action_0: Host to Device, is dependent on a future action called Action_2 (see below).
206-
// SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[0], 1 /* 1 */, &event[2] /* &start_event */));
207-
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[0], 0, nullptr));
208-
std::cout << std::endl
209-
<< std::endl;
204+
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[0], 1 /* 1 */, &event[2] /* &start_event */));
205+
// SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[0], 0, nullptr));
206+
/* std::cout << std::endl
207+
<< std::endl; */
210208

211209
// Action_1: Host to Device, is dependent on Action_0
212210
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[1], 1, &event[0]));
213-
std::cout << std::endl
214-
<< std::endl;
211+
/* std::cout << std::endl
212+
<< std::endl; */
215213

216214
// Action_2: Host to Device, is dependent on Action_1. It also creates a deadlock by having Action_0 dependent on it.
217-
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, nullptr /* event[2] */, 1, &event[1]));
218-
std::cout << std::endl
219-
<< std::endl;
215+
// SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, nullptr /* event[2] */, 1, &event[1]));
216+
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[2], 1, &event[1]));
217+
/* std::cout << std::endl
218+
<< std::endl; */
220219

221220
std::cout << "\n\n\n";
222221

@@ -225,43 +224,38 @@ int main(int argc, char *argv[]) {
225224
ze_command_queue_desc_t command_queue_description{};
226225
command_queue_description.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
227226
command_queue_description.pNext = nullptr;
228-
command_queue_description.ordinal = 0;
229-
command_queue_description.index = 0;
227+
// command_queue_description.ordinal = 0;
228+
// command_queue_description.index = 0;
230229
command_queue_description.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
231230

232231
ze_command_queue_handle_t command_queue{};
233232
SUCCESS_OR_TERMINATE(zeCommandQueueCreate(context, pDevice, &command_queue_description, &command_queue));
234233

235-
// This segfaults. TODO!!! Fix
234+
// Explicitly break the dependency by signaling the last event.
235+
// zeEventHostSignal(event[2]);
236+
236237
SUCCESS_OR_TERMINATE(zeCommandQueueExecuteCommandLists(command_queue, 1, &command_list, nullptr));
237238

238239
SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(command_queue, UINT64_MAX));
239240

240241
// SUCCESS_OR_TERMINATE(zeEventHostSignal(start_event));
241242

242-
// signal the event from the device and wait for completion
243-
244-
// zeCommandListAppendSignalEvent(command_list, event[0]);
245-
// zeEventHostSynchronize(event[0], UINT64_MAX);
246-
247243
std::cout << "Congratulations, the device completed execution!\n";
248244

249245
SUCCESS_OR_TERMINATE(zeCommandQueueDestroy(command_queue));
250246

251-
// These two hang. TODO!!! Fix
252-
/* SUCCESS_OR_TERMINATE(zeMemFree(context, host_mem_ptr));
253-
SUCCESS_OR_TERMINATE(zeMemFree(context, device_mem_ptr)); */
247+
SUCCESS_OR_TERMINATE(zeMemFree(context, host_mem_ptr));
248+
SUCCESS_OR_TERMINATE(zeMemFree(context, device_mem_ptr));
254249

255250
SUCCESS_OR_TERMINATE(zeEventDestroy(event[0]));
256251
SUCCESS_OR_TERMINATE(zeEventDestroy(event[1]));
257252
SUCCESS_OR_TERMINATE(zeEventDestroy(event[2]));
258253
// SUCCESS_OR_TERMINATE(zeEventDestroy(start_event));
259254

260-
// These these hang. TODO!!! Fix
261-
/* SUCCESS_OR_TERMINATE(zeEventPoolDestroy(event_pool));
255+
SUCCESS_OR_TERMINATE(zeEventPoolDestroy(event_pool));
262256
SUCCESS_OR_TERMINATE(zeCommandListDestroy(command_list));
263257

264-
SUCCESS_OR_TERMINATE(zeContextDestroy(context));*/
258+
SUCCESS_OR_TERMINATE(zeContextDestroy(context));
265259

266260
if (tracing_runtime_enabled) {
267261
std::cout << "Disable Tracing Layer after init" << std::endl;
@@ -271,6 +265,6 @@ int main(int argc, char *argv[]) {
271265
exit(1);
272266
}
273267
}
274-
std::cout << "Returning with 0 looks like it hangs here ... ???" << std::endl;
268+
275269
return 0;
276270
}

source/layers/validation/checkers/events_deadlock/zel_events_deadlock_checker.cpp

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -448,7 +448,7 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListUpdateMutableComman
448448
ze_event_handle_t hSignalEvent ///< [in][optional] handle of the event to signal on completion
449449
) {
450450
// TODO: Implement this
451-
// checkForDeadlock("zeCommandListUpdateMutableCommandSignalEventExp", hSignalEvent, 0, nullptr);
451+
checkForDeadlock("zeCommandListUpdateMutableCommandSignalEventExp", hSignalEvent, 0, nullptr);
452452

453453
return ZE_RESULT_SUCCESS;
454454
}
@@ -530,6 +530,16 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListImmediateAppendComm
530530
void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::string zeCallDisc, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
531531
int this_action_new_node_id = invalidDagID;
532532

533+
// Check if user is using invalid events, hint if it doesn't exist in eventToDagID
534+
if (eventToDagID.find(hSignalEvent) == eventToDagID.end()) {
535+
std::cerr << "Warning: Wait event " << hSignalEvent << " does not exist in eventToDagID map. It might be an invalid event." << std::endl;
536+
}
537+
for (uint32_t i = 0; i < numWaitEvents; i++) {
538+
if (eventToDagID.find(phWaitEvents[i]) == eventToDagID.end()) {
539+
std::cerr << "Warning: Wait event " << phWaitEvents[i] << " does not exist in eventToDagID map. It might be an invalid event." << std::endl;
540+
}
541+
}
542+
533543
if (hSignalEvent != nullptr) {
534544
auto it = eventToDagID.find(hSignalEvent);
535545
if (it != eventToDagID.end() && it->second != invalidDagID) {
@@ -592,15 +602,15 @@ void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::strin
592602
// std::cerr << "\t\tThere is already a path from " << this_action_new_node_id << " to " << dagID << ": " << dag.Path(this_action_new_node_id, dagID, 5) << std::endl;
593603
auto path = dag.PathDagIDs(this_action_new_node_id, dagID, 5);
594604

595-
std::string dependencyPrefix = "|\n\t-> ";
596-
std::cerr << "There is already a path from:\n";
605+
std::string spacePrefix = "";
606+
std::cerr << "Warning: There may be a potential event deadlock! There is already a path from:\n";
597607
auto dagIDsInPath = path.first;
598608
std::cerr << getActionDetails(dagIDsInPath[0]) << "\n";
599609
for (uint32_t i = 1; i < dagIDsInPath.size(); i++) {
600-
std::cerr << dependencyPrefix << getActionDetails(dagIDsInPath[i]) << "\n";
610+
std::cerr << spacePrefix << "|\n"
611+
<< spacePrefix << "-> " << getActionDetails(dagIDsInPath[i]) << "\n";
612+
spacePrefix += " ";
601613
}
602-
603-
std::cerr << "\tWarning: There may be a potential event deadlock!" << std::endl;
604614
}
605615
} else {
606616
std::cerr << "eventsDeadlockChecker: zeCommandListAppendMemoryCopyPrologue: Error: Wait event not found in eventToDagID map" << std::endl;

0 commit comments

Comments
 (0)