@@ -700,6 +700,10 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
700700 COPCORE_CUDA_CHECK (cudaStreamWaitEvent (waitingStream, cudaEvent));
701701 };
702702
703+ // needed for the HOTFIX below
704+ int injectIteration[numThreads];
705+ std::fill_n (injectIteration, numThreads, -1 );
706+
703707 std::unique_ptr<HitProcessingContext> hitProcessing{new HitProcessingContext{transferStream}};
704708 std::thread hitProcessingThread{&HitProcessingLoop, (HitProcessingContext *)hitProcessing.get (),
705709 std::ref (gpuState), std::ref (eventStates),
@@ -717,6 +721,15 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
717721 return {TransportThreads, transportBlocks};
718722 };
719723
724+ std::chrono::steady_clock::time_point startTime;
725+ if (debugLevel >= 2 ) {
726+ static bool isInitialized = false ;
727+ if (!isInitialized) {
728+ startTime = std::chrono::steady_clock::now ();
729+ isInitialized = true ;
730+ }
731+ }
732+
720733 while (gpuState.runTransport ) {
721734 // NVTXTracer nvtx1{"Setup"}, nvtx2{"Setup2"};
722735 InitSlotManagers<<<80 , 256 , 0 , gpuState.stream>>> (gpuState.slotManager_dev , gpuState.nSlotManager_dev );
@@ -757,11 +770,6 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
757770 // {EventState::ScoringRetrieved, "ScoringRetrieved"}};
758771 // #endif
759772
760- std::chrono::steady_clock::time_point startTime;
761- if (debugLevel >= 2 ) {
762- startTime = std::chrono::steady_clock::now ();
763- }
764-
765773 for (unsigned int iteration = 0 ;
766774 inFlight > 0 || gpuState.injectState != InjectState::Idle || gpuState.extractState != ExtractState::Idle ||
767775 std::any_of (eventStates.begin (), eventStates.end (), needTransport);
@@ -789,20 +797,37 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
789797 // *** Particle injection ***
790798 // --------------------------
791799 if (gpuState.injectState == InjectState::Idle) {
792- for (auto &eventState : eventStates) {
793- if (const auto state = eventState.load (std::memory_order_acquire); state == EventState::G4RequestsFlush) {
794- eventState = EventState::Inject;
800+ for (int i = 0 ; i < numThreads; ++i) {
801+ auto &eventState = eventStates[i];
802+ const auto state = eventState.load (std::memory_order_acquire);
803+
804+ // HOTFIX:
805+ // the current problem is that the injected particle takes too long before it shows up
806+ // until then, the EventState has counted up and requested a Flush of the event, before the particle was
807+ // injected leading to the particle being scored in the wrong event. By not going up into the
808+ // InjectionCompleted state for some hardcoded number of iterations, this is prevented. FIXME to be fixed
809+ // properly
810+ if (state == EventState::G4RequestsFlush) {
811+ eventState.store (EventState::Inject, std::memory_order_release);
812+ injectIteration[i] = iteration; // store iteration
795813 } else if (state == EventState::Inject) {
796- eventState = EventState::InjectionCompleted;
814+ if (iteration - injectIteration[i] >= 25 ) {
815+ eventState.store (EventState::InjectionCompleted, std::memory_order_release);
816+ injectIteration[i] = -1 ;
817+ } // else: wait more iterations
818+ }
819+ // If eventState leaves Inject unexpectedly, reset iteration tracking:
820+ else if (injectIteration[i] != -1 && state != EventState::Inject) {
821+ injectIteration[i] = -1 ;
797822 }
798823 }
799824
800- if (auto &toDevice = trackBuffer.getActiveBuffer (); toDevice.nTrack > 0 ) {
825+ if (auto &toDevice = trackBuffer.getActiveBuffer (); toDevice.nTrack . load (std::memory_order_acquire) > 0 ) {
801826 gpuState.injectState = InjectState::CreatingSlots;
802827
803828 trackBuffer.swapToDeviceBuffers ();
804829 std::scoped_lock lock{toDevice.mutex };
805- const auto nInject = std::min (toDevice.nTrack .load (), toDevice.maxTracks );
830+ const auto nInject = std::min (toDevice.nTrack .load (std::memory_order_acquire ), toDevice.maxTracks );
806831 toDevice.nTrack = 0 ;
807832
808833 if (debugLevel > 3 ) std::cout << " Injecting " << nInject << " to GPU\n " ;
@@ -1047,8 +1072,7 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
10471072 if (state == EventState::WaitingForTransportToFinish && gpuState.stats ->perEventInFlight [threadId] == 0 ) {
10481073 eventStates[threadId] = EventState::RequestHitFlush;
10491074 }
1050- if (EventState::RequestHitFlush <= state && state < EventState::LeakedTracksRetrieved &&
1051- gpuState.stats ->perEventInFlight [threadId] != 0 ) {
1075+ if (state >= EventState::RequestHitFlush && gpuState.stats ->perEventInFlight [threadId] != 0 ) {
10521076 std::cerr << " ERROR thread " << threadId << " is in state " << static_cast <unsigned int >(state)
10531077 << " and occupancy is " << gpuState.stats ->perEventInFlight [threadId] << " \n " ;
10541078 }
@@ -1074,8 +1098,9 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
10741098 }
10751099
10761100 // *** Notify G4 workers if their events completed ***
1077- if (std::any_of (eventStates.begin (), eventStates.end (),
1078- [](const EventState &state) { return state == EventState::DeviceFlushed; })) {
1101+ if (std::any_of (eventStates.begin (), eventStates.end (), [](const std::atomic<EventState> &state) {
1102+ return state.load (std::memory_order_acquire) == EventState::DeviceFlushed;
1103+ })) {
10791104 // Notify HitProcessingThread to notify the workers. Do not notify workers directly, as this could bypass the
10801105 // processing of hits
10811106 hitProcessing->cv .notify_one ();
0 commit comments