Skip to content

Commit

Permalink
Be more consistent and conservative with GPU syncing on D3D12
Browse files Browse the repository at this point in the history
* In some cases we were only syncing one queue when we needed to flush and sync
  the whole GPU. Rename functions to be more clear about what is being synced,
  and only sync one queue/our internal queue when we know that's the only work
  we need to wait on.
  • Loading branch information
baldurk committed Jan 7, 2025
1 parent 74a6e28 commit 64c77e9
Show file tree
Hide file tree
Showing 11 changed files with 97 additions and 91 deletions.
14 changes: 7 additions & 7 deletions renderdoc/driver/d3d12/d3d12_command_queue_wrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ bool WrappedID3D12CommandQueue::Serialise_ExecuteCommandLists(SerialiserType &se
ToStr(GetResourceManager()->GetOriginalID(m_PrevQueueId)).c_str(),
ToStr(GetResourceManager()->GetOriginalID(GetResID(pQueue))).c_str());
if(m_PrevQueueId != ResourceId())
m_pDevice->GPUSync(GetResourceManager()->GetCurrentAs<ID3D12CommandQueue>(m_PrevQueueId));
m_pDevice->DeviceWaitForIdle();

m_PrevQueueId = GetResID(pQueue);
}
Expand All @@ -493,7 +493,7 @@ bool WrappedID3D12CommandQueue::Serialise_ExecuteCommandLists(SerialiserType &se
ID3D12CommandList *list = Unwrap(ppCommandLists[i]);
real->ExecuteCommandLists(1, &list);
if(D3D12_Debug_SingleSubmitFlushing() || D3D12_Debug_RT_Auditing())
m_pDevice->GPUSync();
m_pDevice->DeviceWaitForIdle();

BakedCmdListInfo &info = m_Cmd.m_BakedCmdListInfo[cmd];

Expand Down Expand Up @@ -582,7 +582,7 @@ bool WrappedID3D12CommandQueue::Serialise_ExecuteCommandLists(SerialiserType &se
if(!info.executeEvents.empty())
{
// ensure all GPU work has finished for readback of arguments
m_pDevice->GPUSync();
m_pDevice->DeviceWaitForIdle();

if(m_pDevice->HasFatalError())
return false;
Expand Down Expand Up @@ -778,7 +778,7 @@ bool WrappedID3D12CommandQueue::Serialise_ExecuteCommandLists(SerialiserType &se
for(size_t i = 0; i < rerecordedCmds.size(); i++)
{
real->ExecuteCommandLists(1, &rerecordedCmds[i]);
m_pDevice->GPUSync();
m_pDevice->DeviceWaitForIdle();
}
}
else
Expand Down Expand Up @@ -1092,7 +1092,7 @@ void WrappedID3D12CommandQueue::ExecuteCommandListsInternal(UINT NumCommandLists
queueReadback.list->Close();
ID3D12CommandList *listptr = Unwrap(queueReadback.list);
queueReadback.unwrappedQueue->ExecuteCommandLists(1, &listptr);
m_pDevice->GPUSync(queueReadback.unwrappedQueue, Unwrap(queueReadback.fence));
m_pDevice->QueueWaitForIdle(queueReadback.unwrappedQueue, Unwrap(queueReadback.fence));

data = queueReadback.readbackMapped;
}
Expand Down Expand Up @@ -1397,7 +1397,7 @@ bool WrappedID3D12CommandQueue::Serialise_Signal(SerialiserType &ser, ID3D12Fenc
if(IsReplayingAndReading() && pFence)
{
m_pReal->Signal(Unwrap(pFence), Value);
m_pDevice->GPUSync(pQueue);
m_pDevice->DeviceWaitForIdle();
}

return true;
Expand Down Expand Up @@ -1435,7 +1435,7 @@ bool WrappedID3D12CommandQueue::Serialise_Wait(SerialiserType &ser, ID3D12Fence

if(IsReplayingAndReading() && pFence)
{
m_pDevice->GPUSync(pQueue);
m_pDevice->DeviceWaitForIdle();
}

return true;
Expand Down
2 changes: 1 addition & 1 deletion renderdoc/driver/d3d12/d3d12_counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ rdcarray<CounterResult> D3D12Replay::FetchCounters(const rdcarray<GPUCounter> &c

m_pDevice->ExecuteLists();
m_pDevice->FlushLists();
m_pDevice->GPUSyncAllQueues();
m_pDevice->DeviceWaitForIdle();

D3D12_RANGE range;
range.Begin = 0;
Expand Down
6 changes: 3 additions & 3 deletions renderdoc/driver/d3d12/d3d12_debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2116,7 +2116,7 @@ void D3D12DebugManager::GetBufferData(ID3D12Resource *buffer, uint64_t offset, u
if(buffer == NULL)
return;

m_pDevice->GPUSyncAllQueues();
m_pDevice->ReplayWorkWaitForIdle();

D3D12_RESOURCE_DESC desc = buffer->GetDesc();
D3D12_HEAP_PROPERTIES heapProps = {};
Expand Down Expand Up @@ -2207,7 +2207,7 @@ void D3D12DebugManager::GetBufferData(ID3D12Resource *buffer, uint64_t offset, u

ID3D12CommandList *l = m_DebugList;
m_pDevice->GetQueue()->ExecuteCommandLists(1, &l);
m_pDevice->GPUSync();
m_pDevice->InternalQueueWaitForIdle();
m_DebugAlloc->Reset();

D3D12_RANGE range = {0, (size_t)chunkSize};
Expand Down Expand Up @@ -2247,7 +2247,7 @@ void D3D12DebugManager::GetBufferData(ID3D12Resource *buffer, uint64_t offset, u

ID3D12CommandList *l = m_DebugList;
m_pDevice->GetQueue()->ExecuteCommandLists(1, &l);
m_pDevice->GPUSync();
m_pDevice->InternalQueueWaitForIdle();
m_DebugAlloc->Reset();
}

Expand Down
91 changes: 46 additions & 45 deletions renderdoc/driver/d3d12/d3d12_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -678,9 +678,9 @@ WrappedID3D12Device::WrappedID3D12Device(ID3D12Device *realDevice, D3D12InitPara
m_HeaderChunk = NULL;

m_Alloc = m_DataUploadAlloc = NULL;
m_GPUSyncFence = NULL;
m_GPUSyncHandle = NULL;
m_GPUSyncCounter = 0;
m_WFIFence = NULL;
m_WFIHandle = NULL;
m_WFICounter = 0;
m_OverlaySyncHandle = NULL;

initStateCurBatch = 0;
Expand Down Expand Up @@ -900,12 +900,9 @@ WrappedID3D12Device::~WrappedID3D12Device()
for(size_t i = 0; i < m_InternalCmds.freecmds.size(); i++)
SAFE_RELEASE(m_InternalCmds.freecmds[i]);

DeviceWaitForIdle();
for(size_t i = 0; i < m_QueueFences.size(); i++)
{
GPUSync(m_Queues[i], m_QueueFences[i]);

SAFE_RELEASE(m_QueueFences[i]);
}

for(auto it = m_UploadBuffers.begin(); it != m_UploadBuffers.end(); ++it)
{
Expand Down Expand Up @@ -2180,7 +2177,7 @@ bool WrappedID3D12Device::Serialise_MapDataWrite(SerialiserType &ser, ID3D12Reso
m_CurDataUpload++;
if(m_CurDataUpload == ARRAY_COUNT(m_DataUploadList))
{
GPUSync();
InternalQueueWaitForIdle();
m_CurDataUpload = 0;
}
}
Expand Down Expand Up @@ -2331,7 +2328,7 @@ bool WrappedID3D12Device::Serialise_WriteToSubresource(SerialiserType &ser, ID3D
m_CurDataUpload++;
if(m_CurDataUpload == ARRAY_COUNT(m_DataUploadList))
{
GPUSync();
InternalQueueWaitForIdle();
m_CurDataUpload = 0;
}
}
Expand Down Expand Up @@ -2728,7 +2725,7 @@ void WrappedID3D12Device::StartFrameCapture(DeviceOwnedWindow devWnd)
initStateCurBatch = 0;
initStateCurList = NULL;

GPUSyncAllQueues();
DeviceWaitForIdle();

// wait until we've synced all queues to check for these
GetResourceManager()->GetRTManager()->TickASManagement();
Expand Down Expand Up @@ -2863,7 +2860,7 @@ bool WrappedID3D12Device::EndFrameCapture(DeviceOwnedWindow devWnd)

m_State = CaptureState::BackgroundCapturing;

GPUSync();
DeviceWaitForIdle();
}

rdcarray<MapState> maps = GetMaps();
Expand Down Expand Up @@ -3217,7 +3214,7 @@ bool WrappedID3D12Device::DiscardFrameCapture(DeviceOwnedWindow devWnd)

m_State = CaptureState::BackgroundCapturing;

GPUSync();
DeviceWaitForIdle();

queues = m_Queues;
}
Expand Down Expand Up @@ -4468,10 +4465,10 @@ void WrappedID3D12Device::CreateInternalResources()
CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, __uuidof(ID3D12CommandAllocator),
(void **)&m_Alloc);
InternalRef();
CreateFence(0, D3D12_FENCE_FLAG_NONE, __uuidof(ID3D12Fence), (void **)&m_GPUSyncFence);
m_GPUSyncFence->SetName(L"m_GPUSyncFence");
CreateFence(0, D3D12_FENCE_FLAG_NONE, __uuidof(ID3D12Fence), (void **)&m_WFIFence);
m_WFIFence->SetName(L"m_WFIFence");
InternalRef();
m_GPUSyncHandle = ::CreateEvent(NULL, FALSE, FALSE, NULL);
m_WFIHandle = ::CreateEvent(NULL, FALSE, FALSE, NULL);

CreateFence(0, D3D12_FENCE_FLAG_NONE, __uuidof(ID3D12Fence), (void **)&m_OverlayFence);
m_OverlayFence->SetName(L"m_OverlayFence");
Expand All @@ -4493,7 +4490,7 @@ void WrappedID3D12Device::CreateInternalResources()
}

GetResourceManager()->SetInternalResource(m_Alloc);
GetResourceManager()->SetInternalResource(m_GPUSyncFence);
GetResourceManager()->SetInternalResource(m_WFIFence);

CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, __uuidof(ID3D12CommandAllocator),
(void **)&m_DataUploadAlloc);
Expand Down Expand Up @@ -4542,7 +4539,7 @@ void WrappedID3D12Device::CreateInternalResources()
RDCERR("Failed to create RTV heap");
}

m_GPUSyncCounter = 0;
m_WFICounter = 0;

if(m_TextRenderer == NULL)
m_TextRenderer = new D3D12TextRenderer(this);
Expand All @@ -4555,7 +4552,7 @@ void WrappedID3D12Device::CreateInternalResources()

void WrappedID3D12Device::DestroyInternalResources()
{
if(m_GPUSyncHandle == NULL)
if(m_WFIHandle == NULL)
return;

SAFE_RELEASE(m_pAMDExtObject);
Expand Down Expand Up @@ -4585,67 +4582,71 @@ void WrappedID3D12Device::DestroyInternalResources()
}

SAFE_RELEASE(m_Alloc);
SAFE_RELEASE(m_GPUSyncFence);
SAFE_RELEASE(m_WFIFence);
SAFE_RELEASE(m_OverlayFence);
CloseHandle(m_GPUSyncHandle);
CloseHandle(m_WFIHandle);
CloseHandle(m_OverlaySyncHandle);
}

void WrappedID3D12Device::DataUploadSync()
{
if(m_CurDataUpload >= 0)
{
GPUSync();
InternalQueueWaitForIdle();
m_CurDataUpload = 0;
}
}

void WrappedID3D12Device::GPUSync(ID3D12CommandQueue *queue, ID3D12Fence *fence)
void WrappedID3D12Device::InternalQueueWaitForIdle()
{
QueueWaitForIdle(GetQueue(), m_WFIFence);
}

void WrappedID3D12Device::QueueWaitForIdle(ID3D12CommandQueue *queue, ID3D12Fence *fence)
{
m_GPUSyncCounter++;
m_WFICounter++;

if(HasFatalError())
return;

if(queue == NULL)
queue = GetQueue();

if(fence == NULL)
fence = m_GPUSyncFence;

HRESULT hr = queue->Signal(fence, m_GPUSyncCounter);
HRESULT hr = queue->Signal(fence, m_WFICounter);
CHECK_HR(this, hr);
RDCASSERTEQUAL(hr, S_OK);

fence->SetEventOnCompletion(m_GPUSyncCounter, m_GPUSyncHandle);
fence->SetEventOnCompletion(m_WFICounter, m_WFIHandle);

// wait 10s for hardware GPUs, 100s for CPU
if(m_Replay && m_Replay->GetDriverInfo().vendor == GPUVendor::Software)
WaitForSingleObject(m_GPUSyncHandle, 100000);
WaitForSingleObject(m_WFIHandle, 100000);
else
WaitForSingleObject(m_GPUSyncHandle, 10000);
WaitForSingleObject(m_WFIHandle, 10000);

hr = m_pDevice->GetDeviceRemovedReason();
CHECK_HR(this, hr);
RDCASSERTEQUAL(hr, S_OK);
}

void WrappedID3D12Device::GPUSyncAllQueues()
void WrappedID3D12Device::ReplayWorkWaitForIdle()
{
if(m_GPUSynced)
if(m_WaitedForIdleAfterReplay)
return;

for(size_t i = 0; i < m_QueueFences.size(); i++)
GPUSync(m_Queues[i], m_QueueFences[i]);
DeviceWaitForIdle();

m_GPUSynced = true;
m_WaitedForIdleAfterReplay = true;
}

void WrappedID3D12Device::DeviceWaitForIdle()
{
for(size_t i = 0; i < m_QueueFences.size(); i++)
QueueWaitForIdle(m_Queues[i], m_QueueFences[i]);
}

ID3D12GraphicsCommandListX *WrappedID3D12Device::GetNewList()
{
ID3D12GraphicsCommandListX *ret = NULL;

m_GPUSynced = false;
m_WaitedForIdleAfterReplay = false;

if(!m_InternalCmds.freecmds.empty())
{
Expand Down Expand Up @@ -4783,7 +4784,7 @@ void WrappedID3D12Device::FlushLists(bool forceSync, ID3D12CommandQueue *queue)

if(!m_InternalCmds.submittedcmds.empty() || forceSync)
{
GPUSync(queue);
QueueWaitForIdle(queue, m_WFIFence);

if(!m_InternalCmds.submittedcmds.empty())
m_InternalCmds.freecmds.append(m_InternalCmds.submittedcmds);
Expand Down Expand Up @@ -5382,22 +5383,22 @@ void WrappedID3D12Device::ReplayLog(uint32_t startEventID, uint32_t endEventID,
{
bool partial = true;

m_GPUSynced = false;
m_WaitedForIdleAfterReplay = false;

if(startEventID == 0 && (replayType == eReplay_WithoutDraw || replayType == eReplay_Full))
{
startEventID = 1;
partial = false;

m_GPUSyncCounter++;
m_WFICounter++;

GPUSyncAllQueues();
DeviceWaitForIdle();

// I'm not sure of the reason for this, but the debug layer warns about being unable to resubmit
// command lists due to the 'previous queue fence' not being ready yet, even if no fences are
// signalled or waited. So instead we just signal a dummy fence each new 'frame'
for(size_t i = 0; i < m_Queues.size(); i++)
CHECK_HR(this, m_Queues[i]->Signal(m_QueueFences[i], m_GPUSyncCounter));
CHECK_HR(this, m_Queues[i]->Signal(m_QueueFences[i], m_WFICounter));

FlushLists(true);
m_CurDataUpload = 0;
Expand All @@ -5423,7 +5424,7 @@ void WrappedID3D12Device::ReplayLog(uint32_t startEventID, uint32_t endEventID,
ExecuteLists();
FlushLists(true);

GPUSyncAllQueues();
DeviceWaitForIdle();

// clear any previous ray dispatch references
D3D12CommandData &cmd = *m_Queue->GetCommandData();
Expand Down
22 changes: 15 additions & 7 deletions renderdoc/driver/d3d12/d3d12_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -595,8 +595,8 @@ class WrappedID3D12Device : public IFrameCapturer, public ID3DDevice, public ID3
rdcarray<WrappedID3D12CommandQueue *> m_Queues;
rdcarray<ID3D12Fence *> m_QueueFences;

// if we've called GPUSyncAllQueues since the last replay
bool m_GPUSynced = false;
// if we've called ReplayWorkWaitForIdle since the last replay or internal work
bool m_WaitedForIdleAfterReplay = false;

// list of queues and buffers kept alive during capture artificially even if the user destroys
// them, so we can use them in the capture. Storing this separately prevents races where a
Expand Down Expand Up @@ -634,9 +634,9 @@ class WrappedID3D12Device : public IFrameCapturer, public ID3DDevice, public ID3
ID3D12GraphicsCommandList *m_DataUploadList[64] = {};
size_t m_CurDataUpload = 0;
ID3D12DescriptorHeap *m_RTVHeap = NULL;
ID3D12Fence *m_GPUSyncFence;
HANDLE m_GPUSyncHandle;
UINT64 m_GPUSyncCounter;
ID3D12Fence *m_WFIFence;
HANDLE m_WFIHandle;
UINT64 m_WFICounter;

ID3D12Fence *m_OverlayFence = NULL;
UINT64 m_CurOverlay = 0;
Expand Down Expand Up @@ -1073,8 +1073,16 @@ class WrappedID3D12Device : public IFrameCapturer, public ID3DDevice, public ID3

void DataUploadSync();

void GPUSync(ID3D12CommandQueue *queue = NULL, ID3D12Fence *fence = NULL);
void GPUSyncAllQueues();
// Sync a single queue, by submitting the fence then waiting on it
void QueueWaitForIdle(ID3D12CommandQueue *queue, ID3D12Fence *fence);
// Sync to the internal queue - used to ensure any internal work has finished (e.g. FlushLists() above)
// or generally any internal command buffers submitted to the GetQueue() main internal queue.
void InternalQueueWaitForIdle();
// Sync all queues - this always flushes the entire GPU
void DeviceWaitForIdle();
// Sync all queues but only once after each replay or internal work submit. Used when fetching data
// or after a replay to ensure work completes on all captured queues before doing any analysis work
void ReplayWorkWaitForIdle();

RDCDriver GetFrameCaptureDriver() { return RDCDriver::D3D12; }
void StartFrameCapture(DeviceOwnedWindow devWnd);
Expand Down
Loading

0 comments on commit 64c77e9

Please sign in to comment.