Skip to content

Commit 73d905a

Browse files
hotfix for injection in Async AdePT (#367)
1 parent ef6e5ea commit 73d905a

File tree

2 files changed

+42
-17
lines changed

2 files changed

+42
-17
lines changed

include/AdePT/core/AsyncAdePTTransport.cuh

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -700,6 +700,10 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
700700
COPCORE_CUDA_CHECK(cudaStreamWaitEvent(waitingStream, cudaEvent));
701701
};
702702

703+
// needed for the HOTFIX below
704+
int injectIteration[numThreads];
705+
std::fill_n(injectIteration, numThreads, -1);
706+
703707
std::unique_ptr<HitProcessingContext> hitProcessing{new HitProcessingContext{transferStream}};
704708
std::thread hitProcessingThread{&HitProcessingLoop, (HitProcessingContext *)hitProcessing.get(),
705709
std::ref(gpuState), std::ref(eventStates),
@@ -717,6 +721,15 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
717721
return {TransportThreads, transportBlocks};
718722
};
719723

724+
std::chrono::steady_clock::time_point startTime;
725+
if (debugLevel >= 2) {
726+
static bool isInitialized = false;
727+
if (!isInitialized) {
728+
startTime = std::chrono::steady_clock::now();
729+
isInitialized = true;
730+
}
731+
}
732+
720733
while (gpuState.runTransport) {
721734
// NVTXTracer nvtx1{"Setup"}, nvtx2{"Setup2"};
722735
InitSlotManagers<<<80, 256, 0, gpuState.stream>>>(gpuState.slotManager_dev, gpuState.nSlotManager_dev);
@@ -757,11 +770,6 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
757770
// {EventState::ScoringRetrieved, "ScoringRetrieved"}};
758771
// #endif
759772

760-
std::chrono::steady_clock::time_point startTime;
761-
if (debugLevel >= 2) {
762-
startTime = std::chrono::steady_clock::now();
763-
}
764-
765773
for (unsigned int iteration = 0;
766774
inFlight > 0 || gpuState.injectState != InjectState::Idle || gpuState.extractState != ExtractState::Idle ||
767775
std::any_of(eventStates.begin(), eventStates.end(), needTransport);
@@ -789,20 +797,37 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
789797
// *** Particle injection ***
790798
// --------------------------
791799
if (gpuState.injectState == InjectState::Idle) {
792-
for (auto &eventState : eventStates) {
793-
if (const auto state = eventState.load(std::memory_order_acquire); state == EventState::G4RequestsFlush) {
794-
eventState = EventState::Inject;
800+
for (int i = 0; i < numThreads; ++i) {
801+
auto &eventState = eventStates[i];
802+
const auto state = eventState.load(std::memory_order_acquire);
803+
804+
// HOTFIX:
805+
// the current problem is that the injected particle takes too long before it shows up
806+
// until then, the EventState has counted up and requested a Flush of the event, before the particle was
807+
// injected leading to the particle being scored in the wrong event. By not going up into the
808+
// InjectionCompleted state for some hardcoded number of iterations, this is prevented. FIXME to be fixed
809+
// properly
810+
if (state == EventState::G4RequestsFlush) {
811+
eventState.store(EventState::Inject, std::memory_order_release);
812+
injectIteration[i] = iteration; // store iteration
795813
} else if (state == EventState::Inject) {
796-
eventState = EventState::InjectionCompleted;
814+
if (iteration - injectIteration[i] >= 25) {
815+
eventState.store(EventState::InjectionCompleted, std::memory_order_release);
816+
injectIteration[i] = -1;
817+
} // else: wait more iterations
818+
}
819+
// If eventState leaves Inject unexpectedly, reset iteration tracking:
820+
else if (injectIteration[i] != -1 && state != EventState::Inject) {
821+
injectIteration[i] = -1;
797822
}
798823
}
799824

800-
if (auto &toDevice = trackBuffer.getActiveBuffer(); toDevice.nTrack > 0) {
825+
if (auto &toDevice = trackBuffer.getActiveBuffer(); toDevice.nTrack.load(std::memory_order_acquire) > 0) {
801826
gpuState.injectState = InjectState::CreatingSlots;
802827

803828
trackBuffer.swapToDeviceBuffers();
804829
std::scoped_lock lock{toDevice.mutex};
805-
const auto nInject = std::min(toDevice.nTrack.load(), toDevice.maxTracks);
830+
const auto nInject = std::min(toDevice.nTrack.load(std::memory_order_acquire), toDevice.maxTracks);
806831
toDevice.nTrack = 0;
807832

808833
if (debugLevel > 3) std::cout << "Injecting " << nInject << " to GPU\n";
@@ -1047,8 +1072,7 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
10471072
if (state == EventState::WaitingForTransportToFinish && gpuState.stats->perEventInFlight[threadId] == 0) {
10481073
eventStates[threadId] = EventState::RequestHitFlush;
10491074
}
1050-
if (EventState::RequestHitFlush <= state && state < EventState::LeakedTracksRetrieved &&
1051-
gpuState.stats->perEventInFlight[threadId] != 0) {
1075+
if (state >= EventState::RequestHitFlush && gpuState.stats->perEventInFlight[threadId] != 0) {
10521076
std::cerr << "ERROR thread " << threadId << " is in state " << static_cast<unsigned int>(state)
10531077
<< " and occupancy is " << gpuState.stats->perEventInFlight[threadId] << "\n";
10541078
}
@@ -1074,8 +1098,9 @@ void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, Track
10741098
}
10751099

10761100
// *** Notify G4 workers if their events completed ***
1077-
if (std::any_of(eventStates.begin(), eventStates.end(),
1078-
[](const EventState &state) { return state == EventState::DeviceFlushed; })) {
1101+
if (std::any_of(eventStates.begin(), eventStates.end(), [](const std::atomic<EventState> &state) {
1102+
return state.load(std::memory_order_acquire) == EventState::DeviceFlushed;
1103+
})) {
10791104
// Notify HitProcessingThread to notify the workers. Do not notify workers directly, as this could bypass the
10801105
// processing of hits
10811106
hitProcessing->cv.notify_one();

include/AdePT/core/AsyncAdePTTransport.icc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,9 +331,9 @@ void AsyncAdePTTransport<IntegrationLayer>::ProcessGPUSteps(int threadId, int ev
331331
std::cerr << "\033[1;31mError, threadId doesn't match it->threadId " << it->threadId << " threadId " << threadId
332332
<< "\033[0m" << std::endl;
333333
if (it->fEventId != eventId) {
334-
std::cerr << "\033[1;31mError, eventId doesn't match it->fEventId " << it->fEventId << "eventId " << eventId
334+
std::cerr << "\033[1;31mError, eventId doesn't match it->fEventId " << it->fEventId << " eventId " << eventId
335335
<< " num hits to be processed " << (range.second - range.first) << " dataOnBuffer " << dataOnBuffer
336-
<< "state : " << static_cast<unsigned int>(fEventStates[threadId].load(std::memory_order_acquire))
336+
<< " state : " << static_cast<unsigned int>(fEventStates[threadId].load(std::memory_order_acquire))
337337
<< "\033[0m" << std::endl;
338338
}
339339
integrationInstance.ProcessGPUStep(*it, fReturnAllSteps, fReturnLastStep);

0 commit comments

Comments
 (0)