55 changes: 14 additions & 41 deletions sycl/plugins/level_zero/pi_level_zero.cpp
@@ -22,7 +22,6 @@
 #include <string>
 #include <sycl/detail/pi.h>
 #include <sycl/detail/spinlock.hpp>
-#include <thread>
 #include <utility>
 
 #include <zet_api.h>
@@ -914,10 +913,7 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
     ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
         ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
   }
-
-  // Thread id will be used to create separate queue groups per thread.
-  auto TID = std::this_thread::get_id();
-  ComputeQueueGroupsByTID.insert({TID, ComputeQueueGroup});
+  ComputeQueueGroupsByTID.set(ComputeQueueGroup);
 
   // Copy group initialization.
   pi_queue_group_t CopyQueueGroup{this, queue_type::MainCopy};
@@ -943,7 +939,7 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
       }
     }
   }
-  CopyQueueGroupsByTID.insert({TID, CopyQueueGroup});
+  CopyQueueGroupsByTID.set(CopyQueueGroup);
 
   // Initialize compute/copy command batches.
   ComputeCommandBatch.OpenCommandList = CommandListMap.end();
@@ -1235,24 +1231,7 @@ pi_result _pi_context::getAvailableCommandList(
 
 _pi_queue::pi_queue_group_t &_pi_queue::getQueueGroup(bool UseCopyEngine) {
   auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID);
-  auto &InitialGroup = Map.begin()->second;
-
-  // Check if thread-specific immediate commandlists are requested.
-  if (Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue) {
-    // Thread id is used to create separate imm cmdlists per thread.
-    auto Result = Map.insert({std::this_thread::get_id(), InitialGroup});
-    auto &QueueGroupRef = Result.first->second;
-    // If an entry for this thread does not exist, create an entry.
-    if (Result.second) {
-      // Create space for immediate commandlists, which are created on demand.
-      QueueGroupRef.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
-          InitialGroup.ZeQueues.size(), CommandListMap.end());
-    }
-    return QueueGroupRef;
-  }
-
-  // If not PerThreadPerQueue then use the groups from Queue creation time.
-  return InitialGroup;
+  return Map.get();
 }
 
 // Helper function to create a new command-list to this queue and associated
@@ -2521,13 +2500,13 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device,
     // At this point only the thread creating the queue will have associated
     // command-lists. Other threads have not accessed the queue yet. So we can
    // only warmup the initial thread's command-lists.
-    auto InitialGroup = Q->ComputeQueueGroupsByTID.begin()->second;
-    PI_CALL(warmupQueueGroup(false, InitialGroup.UpperIndex -
-                                        InitialGroup.LowerIndex + 1));
+    auto QueueGroup = Q->ComputeQueueGroupsByTID.get();
+    PI_CALL(warmupQueueGroup(false, QueueGroup.UpperIndex -
+                                        QueueGroup.LowerIndex + 1));
     if (Q->useCopyEngine()) {
-      auto InitialGroup = Q->CopyQueueGroupsByTID.begin()->second;
-      PI_CALL(warmupQueueGroup(true, InitialGroup.UpperIndex -
-                                         InitialGroup.LowerIndex + 1));
+      auto QueueGroup = Q->CopyQueueGroupsByTID.get();
+      PI_CALL(warmupQueueGroup(true, QueueGroup.UpperIndex -
+                                         QueueGroup.LowerIndex + 1));
     }
     // TODO: warmup event pools. Both host-visible and device-only.
   }
@@ -2833,14 +2812,9 @@ pi_result piextQueueGetNativeHandle(pi_queue Queue,
   auto ZeQueue = pi_cast<ze_command_queue_handle_t *>(NativeHandle);
 
   // Extract a Level Zero compute queue handle from the given PI queue
   auto &QueueGroup = Queue->getQueueGroup(false /*compute*/);
   uint32_t QueueGroupOrdinalUnused;
-  auto TID = std::this_thread::get_id();
-  auto &InitialGroup = Queue->ComputeQueueGroupsByTID.begin()->second;
-  const auto &Result =
-      Queue->ComputeQueueGroupsByTID.insert({TID, InitialGroup});
Contributor Author:
This code was wrong: it used the same ZeQueues for the new entry, so they were then destroyed multiple times, causing a segfault. Now, without the per-thread setting we don't even create multiple entries, and when we do go per-thread, the new entry is properly initialized here: https://github.com/intel/llvm/pull/8896/files#diff-1f19baf7cc76d94bf4a1819c81e7b34292fb3ce215cd50873e8c9236cc73601cR501
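Illustrative aside (not part of the PR): a minimal standalone C++ sketch of the hazard described above, using hypothetical stand-in types. Two map entries copied from one group end up holding the same raw handle, so teardown would destroy it once per entry.

#include <iostream>
#include <thread>
#include <unordered_map>

// Hypothetical stand-ins, not the plugin's real types.
struct QueueGroup {
  void *ZeQueue = nullptr; // owning native handle
};

int main() {
  std::unordered_map<std::thread::id, QueueGroup> GroupsByTID;

  int Backing = 0;
  QueueGroup Initial;
  Initial.ZeQueue = &Backing; // imagine a zeCommandQueueCreate() call here

  // Buggy pattern: the copied entry shares the same non-null handle.
  GroupsByTID.insert({std::thread::id(), Initial});
  GroupsByTID.insert({std::this_thread::get_id(), Initial});

  // Queue teardown would call zeCommandQueueDestroy once per entry, i.e.
  // twice on one handle: the double destroy behind the segfault.
  int DestroyCalls = 0;
  for (auto &Entry : GroupsByTID)
    if (Entry.second.ZeQueue)
      ++DestroyCalls;
  std::cout << "zeCommandQueueDestroy would run " << DestroyCalls
            << " times on the same handle\n";
}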

Contributor:
This feature was for immediate command lists only, so the ZeQueues should have been unallocated. Have you extended it to command queues as well?

Contributor:
So the vector of ZeQueues is now separate for each entry?

Contributor Author:
It is still for immediate command-lists only, but it is safer to initialize ZeQueues to null.

Contributor:
But that value should have been null, and zeCommandQueueDestroy should not have been called. But it did get called, which means that the vector of ZeQueues has to be explicitly set to nullptrs. OK.

Contributor Author:
It was nullptr, but then a call to getZeQueue created a non-null one: https://github.com/intel/llvm/pull/8896/files#diff-15dd1eb076d2164bd9e87d9057f05f652a716498e8cdf5975e564c65309a0985L2843

That non-null value was then copied to yet another entry created for a different thread.

Contributor:
The per-thread mode and getNative don't mix well. Say you have a queue that uses multiple command lists, one per thread. Someone calls getNative, makes their own queue from it, and assumes that a sync on that queue is also a sync on the original queue. But it isn't, because only one command list will be synced.
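For illustration only (a sketch under assumptions, not code from this PR): the user pattern being discussed might look like the following, assuming the DPC++ Level Zero interop header and that get_native for a queue returns a single ze_command_queue_handle_t.

#include <cstdint>
#include <level_zero/ze_api.h>
#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/backend/level_zero.hpp>

// Hypothetical user code: extract one native handle and synchronize it,
// assuming (incorrectly, in per-thread mode) that this syncs the whole
// sycl::queue.
void sync_via_native(sycl::queue &Q) {
  auto ZeQueue = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(Q);
  // Only work submitted through this one handle is covered; submissions
  // from other host threads go to other command lists.
  zeCommandQueueSynchronize(ZeQueue, UINT64_MAX);
}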

Contributor Author:
Right, but the failure occurred even without per-thread being requested: we still created multiple redundant entries, which additionally held the same queues. I don't think that syncing a queue returned by get_native() gives any guarantee that the entire SYCL queue is synced.

Contributor:
That insert was supposed to be "insert if not found".
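For reference (a standalone illustration, not plugin code): std::unordered_map::insert does behave as "insert if not found"; it never overwrites an existing entry, and the returned bool reports which case happened.

#include <cassert>
#include <unordered_map>

int main() {
  std::unordered_map<int, int> M{{1, 10}};

  auto [It1, Inserted1] = M.insert({1, 99});
  assert(!Inserted1 && It1->second == 10); // key existed: old value kept

  auto [It2, Inserted2] = M.insert({2, 20});
  assert(Inserted2 && It2->second == 20); // key was absent: entry inserted
}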

-  auto &ComputeQueueGroupRef = Result.first->second;
-  *ZeQueue = ComputeQueueGroupRef.getZeQueue(&QueueGroupOrdinalUnused);
+  *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused);
   return PI_SUCCESS;
 }
 
@@ -5557,10 +5531,9 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
   std::vector<pi_command_list_ptr_t> CmdLists;
 
   // There must be at least one L0 queue.
-  auto &InitialComputeGroup = Queue->ComputeQueueGroupsByTID.begin()->second;
-  auto &InitialCopyGroup = Queue->CopyQueueGroupsByTID.begin()->second;
-  PI_ASSERT(!InitialComputeGroup.ZeQueues.empty() ||
-                !InitialCopyGroup.ZeQueues.empty(),
+  auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get();
+  auto &CopyGroup = Queue->CopyQueueGroupsByTID.get();
+  PI_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(),
             PI_ERROR_INVALID_QUEUE);
 
   size_t NumQueues = 0;
46 changes: 44 additions & 2 deletions sycl/plugins/level_zero/pi_level_zero.hpp
@@ -468,16 +468,58 @@ struct _pi_queue : _pi_object {
     uint32_t NextIndex{0};
   };
 
+  // Helper class to facilitate per-thread queue groups.
+  // We maintain a hashtable of queue groups if requested to do them
+  // per-thread. Otherwise it is just a single entry used for all threads.
+  struct pi_queue_group_by_tid_t
+      : public std::unordered_map<std::thread::id, pi_queue_group_t> {
+    bool PerThread = false;
+
+    // Returns the thread id if doing per-thread, or a generic id that
+    // represents all the threads.
+    std::thread::id tid() const {
+      return PerThread ? std::this_thread::get_id() : std::thread::id();
+    }
+
+    // Make the specified queue group be the master.
+    void set(const pi_queue_group_t &QueueGroup) {
+      const auto &Device = QueueGroup.Queue->Device;
+      PerThread = Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue;
+      assert(empty());
+      insert({tid(), QueueGroup});
+    }
+
+    // Get a queue group to use for this thread.
+    pi_queue_group_t &get() {
+      assert(!empty());
+      auto It = find(tid());
+      if (It != end()) {
+        return It->second;
+      }
+      // Add a new queue group for this thread, initialized from the master
+      // entry.
+      auto QueueGroup = begin()->second;
+      // Create space for queues and immediate commandlists, which are created
+      // on demand.
+      QueueGroup.ZeQueues = std::vector<ze_command_queue_handle_t>(
+          QueueGroup.ZeQueues.size(), nullptr);
+      QueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
+          QueueGroup.ZeQueues.size(), QueueGroup.Queue->CommandListMap.end());
+
+      std::tie(It, std::ignore) = insert({tid(), QueueGroup});
+      return It->second;
+    }
+  };
+
   // A map of compute groups containing compute queue handles, one per thread.
   // When a queue is accessed from multiple host threads, a separate queue group
   // is created for each thread. The key used for mapping is the thread ID.
-  std::unordered_map<std::thread::id, pi_queue_group_t> ComputeQueueGroupsByTID;
+  pi_queue_group_by_tid_t ComputeQueueGroupsByTID;
 
   // A group containing copy queue handles. The main copy engine, if available,
   // comes first followed by link copy engines, if available.
   // When a queue is accessed from multiple host threads, a separate queue group
   // is created for each thread. The key used for mapping is the thread ID.
-  std::unordered_map<std::thread::id, pi_queue_group_t> CopyQueueGroupsByTID;
+  pi_queue_group_by_tid_t CopyQueueGroupsByTID;
 
   // Wait for all commandlists associated with this Queue to finish operations.
   pi_result synchronize();
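For readers following the new helper: a simplified, self-contained sketch of the same set/get-or-create idiom (a hypothetical Group type stands in for pi_queue_group_t), showing why a new thread's entry gets freshly nulled handles rather than copies of the master's.

#include <cassert>
#include <thread>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for pi_queue_group_t; only the handle vector matters.
struct Group {
  std::vector<void *> ZeQueues;
};

// Simplified version of pi_queue_group_by_tid_t's set/get idiom.
struct GroupsByTID : std::unordered_map<std::thread::id, Group> {
  bool PerThread = false;

  std::thread::id tid() const {
    return PerThread ? std::this_thread::get_id() : std::thread::id();
  }

  void set(const Group &G) {
    assert(empty());
    insert({tid(), G});
  }

  Group &get() {
    assert(!empty());
    if (auto It = find(tid()); It != end())
      return It->second;
    // Copy the master entry, but give the copy its own null handles so that
    // teardown never destroys the same handle twice.
    Group G = begin()->second;
    G.ZeQueues.assign(G.ZeQueues.size(), nullptr);
    return insert({tid(), G}).first->second;
  }
};

int main() {
  GroupsByTID Map;
  Map.PerThread = true; // mimic _pi_device::PerThreadPerQueue
  int Dummy = 0;
  Map.set(Group{{&Dummy}}); // master entry with one non-null "handle"

  std::thread([&] {
    // A second thread gets its own entry with freshly nulled handles.
    assert(Map.get().ZeQueues[0] == nullptr);
  }).join();

  assert(Map.get().ZeQueues[0] == &Dummy); // the creator keeps the original
}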