Improve async memory usage (apt-sim#344)

SeverinDiederichs · web-flow · commit 61812c3749de · 2025-02-06T09:06:11.000+01:00
This PR passes the debug level properly to the async particle transport
again, so `adept/verbose` now yields useful debug printouts in async
mode.

The handling of the GPU steps is improved: instead of copying out the
full vector and giving every worker's queue a shared pointer to it
(which could cause issues if one worker was already done with its work
and therefore did not release the buffer), only the steps that a given
worker needs to process are now copied into that worker's queue.

Note that this is not the final implementation: it is still too slow,
and the GPU can easily run out of hit slots because the buffer is not
yet ready to be swapped back in. Therefore, a helper function is left
in the code and a few changes have been deferred, to keep the
flexibility to test other approaches. Nonetheless, I am opening this PR
already, since the fix for the OOM crash that occurs when one worker
finishes earlier than the rest might be relevant right away.
diff --git a/include/AdePT/core/AsyncAdePTTransport.cuh b/include/AdePT/core/AsyncAdePTTransport.cuh
@@ -585,9 +585,18 @@ void HitProcessingLoop(HitProcessingContext *const context, GPUstate &gpuState,
     std::unique_lock lock(context->mutex);
     context->cv.wait(lock);
 
+    // Possible timing
+    // auto start = std::chrono::high_resolution_clock::now();
     gpuState.fHitScoring->TransferHitsToHost(context->hitTransferStream);
     const bool haveNewHits = gpuState.fHitScoring->ProcessHits();
 
+    // auto end = std::chrono::high_resolution_clock::now();
+    // std::chrono::duration<double> elapsed = end - start;
+
+    // if (haveNewHits) {
+    //     std::cout << "HIT Processing time: " << elapsed.count() << " seconds" << std::endl;
+    // }
+
     if (haveNewHits) {
       AdvanceEventStates(EventState::FlushingHits, EventState::HitsFlushed, eventStates);
       cvG4Workers.notify_all();
@@ -597,7 +606,7 @@ void HitProcessingLoop(HitProcessingContext *const context, GPUstate &gpuState,
 
 void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, TrackBuffer &trackBuffer, GPUstate &gpuState,
                    std::vector<std::atomic<EventState>> &eventStates, std::condition_variable &cvG4Workers,
-                   std::vector<AdePTScoring> &scoring, int adeptSeed)
+                   std::vector<AdePTScoring> &scoring, int adeptSeed, int debugLevel)
 {
   // NVTXTracer tracer{"TransportLoop"};
 
@@ -661,10 +670,9 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
       std::this_thread::sleep_for(10ms);
     }
 
-    // TODO: Pass debug level here
-    // if (fDebugLevel > 2) {
-    //   G4cout << "GPU transport starting" << std::endl;
-    // }
+    if (debugLevel > 2) {
+      G4cout << "GPU transport starting" << std::endl;
+    }
 
     COPCORE_CUDA_CHECK(cudaStreamSynchronize(gpuState.stream));
 
@@ -727,8 +735,7 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
           const auto nInject = std::min(toDevice.nTrack.load(), toDevice.maxTracks);
           toDevice.nTrack    = 0;
 
-          // TODO: Pass debug level here
-          // if (fDebugLevel > 3) std::cout << "Injecting " << nInject << " to GPU\n";
+          if (debugLevel > 3) std::cout << "Injecting " << nInject << " to GPU\n";
 
           // copy buffer of tracks to device
           COPCORE_CUDA_CHECK(cudaMemcpyAsync(trackBuffer.toDevice_dev.get(), toDevice.tracks,
@@ -963,8 +970,10 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
           hitProcessing->cv.notify_one();
         } else {
           if (gpuState.stats->hitBufferOccupancy >= gpuState.fHitScoring->HitCapacity() / 2 ||
-              std::any_of(eventStates.begin(), eventStates.end(),
-                          [](const auto &state) { return state == EventState::RequestHitFlush; })) {
+              gpuState.stats->hitBufferOccupancy >= 10000 ||
+              std::any_of(eventStates.begin(), eventStates.end(), [](const auto &state) {
+                return state.load(std::memory_order_acquire) == EventState::RequestHitFlush;
+              })) {
             AdvanceEventStates(EventState::RequestHitFlush, EventState::FlushingHits, eventStates);
             gpuState.fHitScoring->SwapDeviceBuffers(gpuState.stream);
             hitProcessing->cv.notify_one();
@@ -978,10 +987,7 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
         cvG4Workers.notify_all();
       }
 
-      // TODO: get fDebugLevel correctly and put prints back in.
-      int fDebugLevel = 0;
-      int fNThread    = numThreads;
-      if (fDebugLevel >= 3 && inFlight > 0 || (fDebugLevel >= 2 && iteration % 500 == 0)) {
+      if (debugLevel >= 3 && inFlight > 0 || (debugLevel >= 2 && iteration % 500 == 0)) {
         std::cerr << inFlight << " in flight ";
         std::cerr << "(" << gpuState.stats->inFlight[ParticleType::Electron] << " "
                   << gpuState.stats->inFlight[ParticleType::Positron] << " "
@@ -992,10 +998,11 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
         std::cerr << "\t slots:" << gpuState.stats->slotFillLevel << ", " << numLeaked << " leaked."
                   << "\tInjectState: " << static_cast<unsigned int>(gpuState.injectState.load())
                   << "\tExtractState: " << static_cast<unsigned int>(gpuState.extractState.load())
-                  << "\tHitBuffer: " << gpuState.stats->hitBufferOccupancy;
-        if (fDebugLevel >= 4) {
+                  << "\tHitBuffer: " << gpuState.stats->hitBufferOccupancy
+                  << "\tHitBufferReadyToSwap: " << gpuState.fHitScoring->ReadyToSwapBuffers();
+        if (debugLevel >= 4) {
           std::cerr << "\n\tper event: ";
-          for (unsigned int i = 0; i < fNThread; ++i) {
+          for (unsigned int i = 0; i < numThreads; ++i) {
             std::cerr << i << ": " << gpuState.stats->perEventInFlight[i]
                       << " (s=" << static_cast<unsigned short>(eventStates[i].load(std::memory_order_acquire)) << ")\t";
           }
@@ -1041,8 +1048,7 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
     ClearAllQueues<<<1, 1, 0, gpuState.stream>>>(queues);
     COPCORE_CUDA_CHECK(cudaStreamSynchronize(gpuState.stream));
 
-    // TODO
-    // if (fDebugLevel > 2) std::cout << "End transport loop.\n";
+    if (debugLevel > 2) std::cout << "End transport loop.\n";
   }
 
   hitProcessing->keepRunning = false;
@@ -1063,11 +1069,13 @@ std::shared_ptr<const std::vector<GPUHit>> GetGPUHits(unsigned int threadId, GPU
 // separate init function that will compile here and be called from the .icc
 std::thread LaunchGPUWorker(int trackCapacity, int scoringCapacity, int numThreads, TrackBuffer &trackBuffer,
                             GPUstate &gpuState, std::vector<std::atomic<EventState>> &eventStates,
-                            std::condition_variable &cvG4Workers, std::vector<AdePTScoring> &scoring, int adeptSeed)
+                            std::condition_variable &cvG4Workers, std::vector<AdePTScoring> &scoring, int adeptSeed,
+                            int debugLevel)
 {
-  return std::thread{&TransportLoop,        trackCapacity,      scoringCapacity,       numThreads,
-                     std::ref(trackBuffer), std::ref(gpuState), std::ref(eventStates), std::ref(cvG4Workers),
-                     std::ref(scoring),     adeptSeed};
+  return std::thread{
+      &TransportLoop,     trackCapacity,         scoringCapacity,       numThreads,        std::ref(trackBuffer),
+      std::ref(gpuState), std::ref(eventStates), std::ref(cvG4Workers), std::ref(scoring), adeptSeed,
+      debugLevel};
 }
 
 void FreeGPU(std::unique_ptr<AsyncAdePT::GPUstate, AsyncAdePT::GPUstateDeleter> &gpuState, G4HepEmState &g4hepem_state,
diff --git a/include/AdePT/core/AsyncAdePTTransport.hh b/include/AdePT/core/AsyncAdePTTransport.hh
@@ -44,7 +44,7 @@ private:
   unsigned short fNThread{0};       ///< Number of G4 workers
   unsigned int fTrackCapacity{0};   ///< Number of track slots to allocate on device
   unsigned int fScoringCapacity{0}; ///< Number of hit slots to allocate on device
-  int fDebugLevel{1};               ///< Debug level
+  int fDebugLevel{0};               ///< Debug level
   int fCUDAStackLimit{0};           ///< CUDA device stack limit
   std::vector<IntegrationLayer> fIntegrationLayerObjects;
   std::unique_ptr<GPUstate, GPUstateDeleter> fGPUstate{nullptr}; ///< CUDA state placeholder
diff --git a/include/AdePT/core/AsyncAdePTTransport.icc b/include/AdePT/core/AsyncAdePTTransport.icc
@@ -46,7 +46,7 @@ void FlushScoring(AdePTScoring &);
 std::shared_ptr<const std::vector<GPUHit>> GetGPUHits(unsigned int, AsyncAdePT::GPUstate &);
 std::thread LaunchGPUWorker(int, int, int, AsyncAdePT::TrackBuffer &, AsyncAdePT::GPUstate &,
                             std::vector<std::atomic<AsyncAdePT::EventState>> &, std::condition_variable &,
-                            std::vector<AdePTScoring> &, int);
+                            std::vector<AdePTScoring> &, int, int);
 std::unique_ptr<AsyncAdePT::GPUstate, AsyncAdePT::GPUstateDeleter> InitializeGPU(int trackCapacity, int scoringCapacity,
                                                                                  int numThreads,
                                                                                  AsyncAdePT::TrackBuffer &trackBuffer,
@@ -79,10 +79,10 @@ template <typename IntegrationLayer>
 AsyncAdePTTransport<IntegrationLayer>::AsyncAdePTTransport(AdePTConfiguration &configuration)
     : fNThread{(ushort)configuration.GetNumThreads()},
       fTrackCapacity{(uint)(1024 * 1024 * configuration.GetMillionsOfTrackSlots())},
-      fScoringCapacity{(uint)(1024 * 1024 * configuration.GetMillionsOfHitSlots())}, fDebugLevel{0},
-      fIntegrationLayerObjects(fNThread), fEventStates(fNThread), fGPUNetEnergy(fNThread, 0.0),
-      fTrackInAllRegions{configuration.GetTrackInAllRegions()}, fGPURegionNames{configuration.GetGPURegionNames()},
-      fCUDAStackLimit{configuration.GetCUDAStackLimit()}
+      fScoringCapacity{(uint)(1024 * 1024 * configuration.GetMillionsOfHitSlots())},
+      fDebugLevel{configuration.GetVerbosity()}, fIntegrationLayerObjects(fNThread), fEventStates(fNThread),
+      fGPUNetEnergy(fNThread, 0.0), fTrackInAllRegions{configuration.GetTrackInAllRegions()},
+      fGPURegionNames{configuration.GetGPURegionNames()}, fCUDAStackLimit{configuration.GetCUDAStackLimit()}
 {
   if (fNThread > kMaxThreads)
     throw std::invalid_argument("AsyncAdePTTransport limited to " + std::to_string(kMaxThreads) + " threads");
@@ -233,7 +233,7 @@ void AsyncAdePTTransport<IntegrationLayer>::Initialize()
 
   fGPUstate  = async_adept_impl::InitializeGPU(fTrackCapacity, fScoringCapacity, fNThread, *fBuffer, fScoring);
   fGPUWorker = async_adept_impl::LaunchGPUWorker(fTrackCapacity, fScoringCapacity, fNThread, *fBuffer, *fGPUstate,
-                                                 fEventStates, fCV_G4Workers, fScoring, fAdePTSeed);
+                                                 fEventStates, fCV_G4Workers, fScoring, fAdePTSeed, fDebugLevel);
 }
 
 template <typename IntegrationLayer>
@@ -263,11 +263,7 @@ void AsyncAdePTTransport<IntegrationLayer>::Flush(G4int threadId, G4int eventId)
 
     std::shared_ptr<const std::vector<GPUHit>> gpuHits;
     while ((gpuHits = async_adept_impl::GetGPUHits(threadId, *fGPUstate)) != nullptr) {
-      GPUHit dummy;
-      dummy.fEventId = eventId;
-      auto range     = std::equal_range(gpuHits->begin(), gpuHits->end(), dummy,
-                                        [](const GPUHit &lhs, const GPUHit &rhs) { return lhs.fEventId < rhs.fEventId; });
-      for (auto it = range.first; it != range.second; ++it) {
+      for (auto it = gpuHits->begin(); it != gpuHits->end(); ++it) {
         assert(it->threadId == threadId);
         integrationInstance.ProcessGPUHit(*it);
       }
diff --git a/include/AdePT/core/PerEventScoringImpl.cuh b/include/AdePT/core/PerEventScoringImpl.cuh
@@ -24,7 +24,7 @@
 
 // Comparison for sorting tracks into events on device:
 struct CompareGPUHits {
-  __device__ bool operator()(const GPUHit &lhs, const GPUHit &rhs) const { return lhs.fEventId < rhs.fEventId; }
+  __device__ bool operator()(const GPUHit &lhs, const GPUHit &rhs) const { return lhs.threadId < rhs.threadId; }
 };
 
 namespace AsyncAdePT {
@@ -71,18 +71,58 @@ class HitScoring {
   std::vector<std::deque<std::shared_ptr<const std::vector<GPUHit>>>> fHitQueues;
   mutable std::shared_mutex fProcessingHitsMutex;
 
+  using GPUHitVectorPtr = std::shared_ptr<const std::vector<GPUHit>>;
+  using HitDeque        = std::deque<GPUHitVectorPtr>;
+  using HitQueueVector  = std::vector<HitDeque>;
+
+  inline size_t calculateMemoryUsage(const HitQueueVector &fHitQueues) // NOTE(review): shadows member fHitQueues
+  {
+    size_t totalMemory = 0;
+
+    for (const auto &dq : fHitQueues) {
+      for (const auto &ptr : dq) {
+        if (ptr) {
+          totalMemory += sizeof(*ptr); // the std::vector object itself
+          totalMemory += ptr->size() * sizeof(GPUHit); // element payload: counts size(), not capacity()
+        }
+      }
+    }
+    return totalMemory;
+  }
+
   void ProcessBuffer(BufferHandle &handle)
   {
     // We are assuming that the caller holds a lock on fProcessingHitsMutex.
     if (handle.state == BufferHandle::State::NeedHostProcessing) {
-      auto hitVector = std::make_shared<std::vector<GPUHit>>();
-      hitVector->assign(handle.hostBuffer, handle.hostBuffer + handle.hitScoringInfo.fSlotCounter);
+
+      // std::cout << "Total Memory Used in fHitQueues: " << calculateMemoryUsage(fHitQueues) / 1024.0 / 1024.0 / 1024.0
+      // << " GB" << std::endl;
+      auto begin = handle.hostBuffer;
+      auto end   = handle.hostBuffer + handle.hitScoringInfo.fSlotCounter;
+
+      while (begin != end) {
+        short threadId = begin->threadId; // threadId of the first hit in the not-yet-distributed range
+
+        // Alternative: linear search; slower, but does not require the hits to be sorted by threadId
+        // auto threadEnd = std::find_if(begin, end,
+        //       [threadId](const GPUHit &hit) { return threadId != hit.threadId; });
+
+        // Binary search for the end of this threadId's run; faster, but requires [begin, end) sorted by threadId
+        auto threadEnd =
+            std::upper_bound(begin, end, threadId, [](short id, const GPUHit &hit) { return id < hit.threadId; });
+
+        // Copy this thread's hits into their own vector and queue it; NOTE(review): threadId is not range-checked
+        auto HitsPerThread = std::make_unique<std::vector<GPUHit>>(begin, threadEnd);
+        fHitQueues[threadId].push_back(std::move(HitsPerThread));
+
+        begin = threadEnd; // continue with the first hit of the next threadId
+      }
+
       handle.hitScoringInfo.fSlotCounter = 0;
       handle.state                       = BufferHandle::State::Free;
 
-      for (auto &hitQueue : fHitQueues) {
-        hitQueue.push_back(hitVector);
-      }
+      // std::cout << "After pushing hitVector: Total Memory Used in fHitQueues: " << calculateMemoryUsage(fHitQueues)
+      // / 1024.0 / 1024.0 / 1024.0 << " GB" << std::endl;
     }
   }
 
@@ -163,7 +203,15 @@ public:
         if (handle.state == BufferHandle::State::NeedHostProcessing) {
           if (!lock) lock.lock();
           haveNewHits = true;
+
+          // Possible timing
+          // auto start = std::chrono::high_resolution_clock::now();
           ProcessBuffer(handle);
+          // auto end = std::chrono::high_resolution_clock::now();
+          // std::chrono::duration<double> elapsed = end - start;
+          //     std::cout << "BUFFER Processing time: " << elapsed.count() << " seconds" << std::endl;
+
+          // lock.unlock();
         }
       }
     }