55 changes: 14 additions & 41 deletions sycl/plugins/level_zero/pi_level_zero.cpp
@@ -22,7 +22,6 @@
 #include <string>
 #include <sycl/detail/pi.h>
 #include <sycl/detail/spinlock.hpp>
-#include <thread>
 #include <utility>
 
 #include <zet_api.h>
@@ -914,10 +913,7 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
     ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
         ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
   }
-
-  // Thread id will be used to create separate queue groups per thread.
-  auto TID = std::this_thread::get_id();
-  ComputeQueueGroupsByTID.insert({TID, ComputeQueueGroup});
+  ComputeQueueGroupsByTID.set(ComputeQueueGroup);
 
   // Copy group initialization.
   pi_queue_group_t CopyQueueGroup{this, queue_type::MainCopy};
@@ -943,7 +939,7 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
       }
     }
   }
-  CopyQueueGroupsByTID.insert({TID, CopyQueueGroup});
+  CopyQueueGroupsByTID.set(CopyQueueGroup);
 
   // Initialize compute/copy command batches.
   ComputeCommandBatch.OpenCommandList = CommandListMap.end();
@@ -1235,24 +1231,7 @@ pi_result _pi_context::getAvailableCommandList(
 
 _pi_queue::pi_queue_group_t &_pi_queue::getQueueGroup(bool UseCopyEngine) {
   auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID);
-  auto &InitialGroup = Map.begin()->second;
-
-  // Check if thread-specific immediate commandlists are requested.
-  if (Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue) {
-    // Thread id is used to create separate imm cmdlists per thread.
-    auto Result = Map.insert({std::this_thread::get_id(), InitialGroup});
-    auto &QueueGroupRef = Result.first->second;
-    // If an entry for this thread does not exist, create an entry.
-    if (Result.second) {
-      // Create space for immediate commandlists, which are created on demand.
-      QueueGroupRef.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
-          InitialGroup.ZeQueues.size(), CommandListMap.end());
-    }
-    return QueueGroupRef;
-  }
-
-  // If not PerThreadPerQueue then use the groups from Queue creation time.
-  return InitialGroup;
+  return Map.get();
 }
 
 // Helper function to create a new command-list to this queue and associated
@@ -2521,13 +2500,13 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device,
     // At this point only the thread creating the queue will have associated
     // command-lists. Other threads have not accessed the queue yet. So we can
    // only warmup the initial thread's command-lists.
-    auto InitialGroup = Q->ComputeQueueGroupsByTID.begin()->second;
-    PI_CALL(warmupQueueGroup(false, InitialGroup.UpperIndex -
-                                        InitialGroup.LowerIndex + 1));
+    auto QueueGroup = Q->ComputeQueueGroupsByTID.get();
+    PI_CALL(warmupQueueGroup(false, QueueGroup.UpperIndex -
+                                        QueueGroup.LowerIndex + 1));
     if (Q->useCopyEngine()) {
-      auto InitialGroup = Q->CopyQueueGroupsByTID.begin()->second;
-      PI_CALL(warmupQueueGroup(true, InitialGroup.UpperIndex -
-                                         InitialGroup.LowerIndex + 1));
+      auto QueueGroup = Q->CopyQueueGroupsByTID.get();
+      PI_CALL(warmupQueueGroup(true, QueueGroup.UpperIndex -
+                                         QueueGroup.LowerIndex + 1));
     }
     // TODO: warmup event pools. Both host-visible and device-only.
   }
@@ -2833,14 +2812,9 @@ pi_result piextQueueGetNativeHandle(pi_queue Queue,
   auto ZeQueue = pi_cast<ze_command_queue_handle_t *>(NativeHandle);
 
   // Extract a Level Zero compute queue handle from the given PI queue
   auto &QueueGroup = Queue->getQueueGroup(false /*compute*/);
   uint32_t QueueGroupOrdinalUnused;
-  auto TID = std::this_thread::get_id();
-  auto &InitialGroup = Queue->ComputeQueueGroupsByTID.begin()->second;
-  const auto &Result =
-      Queue->ComputeQueueGroupsByTID.insert({TID, InitialGroup});
Contributor Author:
This code was wrong: it used the same ZeQueues for the new entry, so they were then destroyed multiple times, causing a segfault. Now, without the per-thread setting we don't even create multiple entries, and when we do go per-thread, the new entry is properly initialized here: https://github.com/intel/llvm/pull/8896/files#diff-1f19baf7cc76d94bf4a1819c81e7b34292fb3ce215cd50873e8c9236cc73601cR501
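Illustrative aside (not part of the PR): a minimal standalone C++ sketch of the hazard described above, using hypothetical stand-in types. Two map entries copied from one group end up holding the same raw handle, so teardown would destroy it once per entry.

#include <iostream>
#include <thread>
#include <unordered_map>

// Hypothetical stand-ins, not the plugin's real types.
struct QueueGroup {
  void *ZeQueue = nullptr; // owning native handle
};

int main() {
  std::unordered_map<std::thread::id, QueueGroup> GroupsByTID;

  int Backing = 0;
  QueueGroup Initial;
  Initial.ZeQueue = &Backing; // imagine a zeCommandQueueCreate() call here

  // Buggy pattern: the copied entry shares the same non-null handle.
  GroupsByTID.insert({std::thread::id(), Initial});
  GroupsByTID.insert({std::this_thread::get_id(), Initial});

  // Queue teardown would call zeCommandQueueDestroy once per entry, i.e.
  // twice on one handle: the double destroy behind the segfault.
  int DestroyCalls = 0;
  for (auto &Entry : GroupsByTID)
    if (Entry.second.ZeQueue)
      ++DestroyCalls;
  std::cout << "zeCommandQueueDestroy would run " << DestroyCalls
            << " times on the same handle\n";
}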

Contributor:
This feature was for immediate command lists only, so the ZeQueues should have been unallocated. Have you extended it to command queues as well?

Contributor:
So the vector of ZeQueues is now separate for each entry?

Contributor Author:
It is still for immediate command-lists only, but it is safer to initialize ZeQueues to null.

Contributor:
But that value should have been null, and zeCommandQueueDestroy should not have been called. But it did get called, which means that the vector of ZeQueues has to be explicitly set to nullptrs. OK.

Contributor Author:
It was nullptr, but then a call to getZeQueue created a non-null one: https://github.com/intel/llvm/pull/8896/files#diff-15dd1eb076d2164bd9e87d9057f05f652a716498e8cdf5975e564c65309a0985L2843

That non-null value was then copied to yet another entry created for a different thread.

Contributor:
The per-thread mode and getNative don't mix well. Say you have a queue that uses multiple command lists, one per thread. Someone calls getNative, makes their own queue from it, and assumes that a sync on that queue is also a sync on the original queue. But it isn't, because only one command list will be synced.
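For illustration only (a sketch under assumptions, not code from this PR): the user pattern being discussed might look like the following, assuming the DPC++ Level Zero interop header and that get_native for a queue returns a single ze_command_queue_handle_t.

#include <cstdint>
#include <level_zero/ze_api.h>
#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/backend/level_zero.hpp>

// Hypothetical user code: extract one native handle and synchronize it,
// assuming (incorrectly, in per-thread mode) that this syncs the whole
// sycl::queue.
void sync_via_native(sycl::queue &Q) {
  auto ZeQueue = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(Q);
  // Only work submitted through this one handle is covered; submissions
  // from other host threads go to other command lists.
  zeCommandQueueSynchronize(ZeQueue, UINT64_MAX);
}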

Contributor Author:
Right, but the failure occurred even without per-thread being requested: we still created multiple redundant entries, which additionally held the same queues. I don't think that syncing a queue returned by get_native() gives any guarantee that the entire SYCL queue is synced.

Contributor:
That insert was supposed to be "insert if not found".
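For reference (a standalone illustration, not plugin code): std::unordered_map::insert does behave as "insert if not found"; it never overwrites an existing entry, and the returned bool reports which case happened.

#include <cassert>
#include <unordered_map>

int main() {
  std::unordered_map<int, int> M{{1, 10}};

  auto [It1, Inserted1] = M.insert({1, 99});
  assert(!Inserted1 && It1->second == 10); // key existed: old value kept

  auto [It2, Inserted2] = M.insert({2, 20});
  assert(Inserted2 && It2->second == 20); // key was absent: entry inserted
}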

-  auto &ComputeQueueGroupRef = Result.first->second;
-  *ZeQueue = ComputeQueueGroupRef.getZeQueue(&QueueGroupOrdinalUnused);
+  *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused);
   return PI_SUCCESS;
 }
 
@@ -5557,10 +5531,9 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
   std::vector<pi_command_list_ptr_t> CmdLists;
 
   // There must be at least one L0 queue.
-  auto &InitialComputeGroup = Queue->ComputeQueueGroupsByTID.begin()->second;
-  auto &InitialCopyGroup = Queue->CopyQueueGroupsByTID.begin()->second;
-  PI_ASSERT(!InitialComputeGroup.ZeQueues.empty() ||
-                !InitialCopyGroup.ZeQueues.empty(),
+  auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get();
+  auto &CopyGroup = Queue->CopyQueueGroupsByTID.get();
+  PI_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(),
             PI_ERROR_INVALID_QUEUE);
 
   size_t NumQueues = 0;
46 changes: 44 additions & 2 deletions sycl/plugins/level_zero/pi_level_zero.hpp
@@ -468,16 +468,58 @@ struct _pi_queue : _pi_object {
     uint32_t NextIndex{0};
   };
 
+  // Helper class to facilitate per-thread queue groups.
+  // We maintain a hashtable of queue groups if requested to do them
+  // per-thread. Otherwise it is just a single entry used for all threads.
+  struct pi_queue_group_by_tid_t
+      : public std::unordered_map<std::thread::id, pi_queue_group_t> {
+    bool PerThread = false;
+
+    // Returns the thread id if doing per-thread, or a generic id that
+    // represents all the threads.
+    std::thread::id tid() const {
+      return PerThread ? std::this_thread::get_id() : std::thread::id();
+    }
+
+    // Make the specified queue group be the master.
+    void set(const pi_queue_group_t &QueueGroup) {
+      const auto &Device = QueueGroup.Queue->Device;
+      PerThread = Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue;
+      assert(empty());
+      insert({tid(), QueueGroup});
+    }
+
+    // Get a queue group to use for this thread.
+    pi_queue_group_t &get() {
+      assert(!empty());
+      auto It = find(tid());
+      if (It != end()) {
+        return It->second;
+      }
+      // Add a new queue group for this thread, initialized from the master
+      // entry.
+      auto QueueGroup = begin()->second;
+      // Create space for queues and immediate commandlists, which are created
+      // on demand.
+      QueueGroup.ZeQueues = std::vector<ze_command_queue_handle_t>(
+          QueueGroup.ZeQueues.size(), nullptr);
+      QueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
+          QueueGroup.ZeQueues.size(), QueueGroup.Queue->CommandListMap.end());
+
+      std::tie(It, std::ignore) = insert({tid(), QueueGroup});
+      return It->second;
+    }
+  };
+
   // A map of compute groups containing compute queue handles, one per thread.
   // When a queue is accessed from multiple host threads, a separate queue group
   // is created for each thread. The key used for mapping is the thread ID.
-  std::unordered_map<std::thread::id, pi_queue_group_t> ComputeQueueGroupsByTID;
+  pi_queue_group_by_tid_t ComputeQueueGroupsByTID;
 
   // A group containing copy queue handles. The main copy engine, if available,
   // comes first followed by link copy engines, if available.
   // When a queue is accessed from multiple host threads, a separate queue group
   // is created for each thread. The key used for mapping is the thread ID.
-  std::unordered_map<std::thread::id, pi_queue_group_t> CopyQueueGroupsByTID;
+  pi_queue_group_by_tid_t CopyQueueGroupsByTID;
 
   // Wait for all commandlists associated with this Queue to finish operations.
   pi_result synchronize();
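For readers following the new helper: a simplified, self-contained sketch of the same set/get-or-create idiom (a hypothetical Group type stands in for pi_queue_group_t), showing why a new thread's entry gets freshly nulled handles rather than copies of the master's.

#include <cassert>
#include <thread>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for pi_queue_group_t; only the handle vector matters.
struct Group {
  std::vector<void *> ZeQueues;
};

// Simplified version of pi_queue_group_by_tid_t's set/get idiom.
struct GroupsByTID : std::unordered_map<std::thread::id, Group> {
  bool PerThread = false;

  std::thread::id tid() const {
    return PerThread ? std::this_thread::get_id() : std::thread::id();
  }

  void set(const Group &G) {
    assert(empty());
    insert({tid(), G});
  }

  Group &get() {
    assert(!empty());
    if (auto It = find(tid()); It != end())
      return It->second;
    // Copy the master entry, but give the copy its own null handles so that
    // teardown never destroys the same handle twice.
    Group G = begin()->second;
    G.ZeQueues.assign(G.ZeQueues.size(), nullptr);
    return insert({tid(), G}).first->second;
  }
};

int main() {
  GroupsByTID Map;
  Map.PerThread = true; // mimic _pi_device::PerThreadPerQueue
  int Dummy = 0;
  Map.set(Group{{&Dummy}}); // master entry with one non-null "handle"

  std::thread([&] {
    // A second thread gets its own entry with freshly nulled handles.
    assert(Map.get().ZeQueues[0] == nullptr);
  }).join();

  assert(Map.get().ZeQueues[0] == &Dummy); // the creator keeps the original
}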