Skip to content

Commit 38556ce

Browse files
Allow blocked command stream programming for commands without Kernel
Change-Id: I691a029bd5511c8f710ef1bff8cc5a9feca644f3
Signed-off-by: Dunajski, Bartosz <[email protected]>
Related-To: NEO-3433
1 parent 55a1dda commit 38556ce

17 files changed

+247
-95
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!groovy
22
dependenciesRevision='06357fd1499ba888288e517541564865ad9c136a-1292'
33
strategy='EQUAL'
4-
allowedCD=259
4+
allowedCD=260
55
allowedF=5

runtime/command_queue/command_queue.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,4 +588,27 @@ bool CommandQueue::blitEnqueueAllowed(bool queueBlocked, cl_command_type cmdType
588588

589589
return commandAllowed && !queueBlocked && blitAllowed;
590590
}
591+
592+
// Decides whether an enqueue of the given command type needs a blocked command stream.
//
// commandType   - OpenCL command type being enqueued (compared against
//                 CL_COMMAND_BARRIER / CL_COMMAND_MARKER below).
// eventsRequest - wait list of the enqueue; only numEventsInWaitList and
//                 eventWaitList are read here.
// blockedQueue  - whether the enqueue is currently blocked.
//
// Returns true when a blocked command stream is required, false otherwise.
bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const {
593+
// A non-blocked enqueue never needs a blocked command stream.
if (!blockedQueue) {
594+
return false;
595+
}
596+
597+
// Cache-flush commands and every command that is not "without kernel" always need one.
if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType)) {
598+
return true;
599+
}
600+
601+
// Barrier/marker commands need one only when timestamp packet writes are enabled on the
// GPGPU command stream receiver and at least one wait-list event has timestamp packet nodes.
if ((CL_COMMAND_BARRIER == commandType || CL_COMMAND_MARKER == commandType) &&
602+
getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
603+
604+
for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
605+
// NOTE(review): castToObjectOrAbort presumably aborts on an invalid cl_event handle — confirm.
auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
606+
if (waitlistEvent->getTimestampPacketNodes()) {
607+
return true;
608+
}
609+
}
610+
}
611+
612+
// Remaining kernel-less commands need no blocked command stream.
return false;
613+
}
591614
} // namespace NEO

runtime/command_queue/command_queue.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
314314

315315
MOCKABLE_VIRTUAL void updateFromCompletionStamp(const CompletionStamp &completionStamp);
316316

317-
virtual bool isCacheFlushCommand(uint32_t commandType) { return false; }
317+
virtual bool isCacheFlushCommand(uint32_t commandType) const { return false; }
318318

319319
cl_int getCommandQueueInfo(cl_command_queue_info paramName,
320320
size_t paramValueSize, void *paramValue,
@@ -429,6 +429,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
429429
cl_int enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest);
430430

431431
virtual void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType){};
432+
bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const;
432433

433434
MOCKABLE_VIRTUAL void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies);
434435
void processProperties(const cl_queue_properties *properties);

runtime/command_queue/command_queue_hw.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ class CommandQueueHw : public CommandQueue {
367367
LinearStream *commandStream,
368368
uint64_t postSyncAddress);
369369

370-
bool isCacheFlushCommand(uint32_t commandType) override;
370+
bool isCacheFlushCommand(uint32_t commandType) const override;
371371

372372
protected:
373373
MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){};
@@ -389,10 +389,11 @@ class CommandQueueHw : public CommandQueue {
389389
LinearStream *obtainCommandStream(const CsrDependencies &csrDependencies, bool profilingRequired,
390390
bool perfCountersRequired, bool blitEnqueue, bool blockedQueue,
391391
const MultiDispatchInfo &multiDispatchInfo,
392+
const EventsRequest &eventsRequest,
392393
std::unique_ptr<KernelOperation> &blockedCommandsData,
393394
Surface **surfaces, size_t numSurfaces) {
394395
LinearStream *commandStream = nullptr;
395-
if (blockedQueue && !multiDispatchInfo.empty()) {
396+
if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue)) {
396397
constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize;
397398
constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;
398399
commandStream = new LinearStream();

runtime/command_queue/command_queue_hw_bdw_plus.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ void CommandQueueHw<GfxFamily>::submitCacheFlush(Surface **surfaces,
1717
}
1818

1919
template <typename GfxFamily>
20-
bool CommandQueueHw<GfxFamily>::isCacheFlushCommand(uint32_t commandType) {
20+
bool CommandQueueHw<GfxFamily>::isCacheFlushCommand(uint32_t commandType) const {
2121
return false;
2222
}
2323

runtime/command_queue/enqueue_common.h

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
228228
}
229229

230230
auto &commandStream = *obtainCommandStream<commandType>(csrDeps, profilingRequired, perfCountersRequired, blitEnqueue, blockQueue,
231-
multiDispatchInfo, blockedCommandsData, surfacesForResidency, numSurfaceForResidency);
231+
multiDispatchInfo, eventsRequest, blockedCommandsData, surfacesForResidency,
232+
numSurfaceForResidency);
232233
auto commandStreamStart = commandStream.getUsed();
233234

234235
if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
@@ -763,15 +764,16 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
763764
eventBuilder = &internalEventBuilder;
764765
DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
765766
}
767+
auto outEvent = eventBuilder->getEvent();
766768

767769
//update queue taskCount
768-
taskCount = eventBuilder->getEvent()->getCompletionStamp();
770+
taskCount = outEvent->getCompletionStamp();
769771

770-
if (multiDispatchInfo.empty()) {
771-
DEBUG_BREAK_IF(!isCommandWithoutKernel(commandType));
772-
auto cmd = std::make_unique<CommandMarker>(*this);
772+
std::unique_ptr<Command> command;
773+
bool storeTimestampPackets = blockedCommandsData && timestampPacketContainer;
773774

774-
eventBuilder->getEvent()->setCommand(std::move(cmd));
775+
if (multiDispatchInfo.empty()) {
776+
command = std::make_unique<CommandMarker>(*this, blockedCommandsData);
775777
} else {
776778
//store task data in event
777779
std::vector<Surface *> allSurfaces;
@@ -788,28 +790,26 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
788790
allSurfaces.push_back(surface->duplicate());
789791
}
790792
PreemptionMode preemptionMode = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
791-
auto cmd = std::make_unique<CommandComputeKernel>(
792-
*this,
793-
std::move(blockedCommandsData),
794-
allSurfaces,
795-
shouldFlushDC(commandType, printfHandler.get()),
796-
slmUsed,
797-
commandType == CL_COMMAND_NDRANGE_KERNEL,
798-
std::move(printfHandler),
799-
preemptionMode,
800-
multiDispatchInfo.peekMainKernel(),
801-
(uint32_t)multiDispatchInfo.size());
802-
803-
if (timestampPacketContainer.get()) {
804-
for (cl_uint i = 0; i < eventsRequest.numEventsInWaitList; i++) {
805-
auto event = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
806-
event->incRefInternal();
807-
}
808-
cmd->setTimestampPacketNode(*timestampPacketContainer, *previousTimestampPacketNodes);
793+
command = std::make_unique<CommandComputeKernel>(*this,
794+
blockedCommandsData,
795+
allSurfaces,
796+
shouldFlushDC(commandType, printfHandler.get()),
797+
slmUsed,
798+
commandType == CL_COMMAND_NDRANGE_KERNEL,
799+
std::move(printfHandler),
800+
preemptionMode,
801+
multiDispatchInfo.peekMainKernel(),
802+
(uint32_t)multiDispatchInfo.size());
803+
}
804+
if (storeTimestampPackets) {
805+
for (cl_uint i = 0; i < eventsRequest.numEventsInWaitList; i++) {
806+
auto event = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
807+
event->incRefInternal();
809808
}
810-
cmd->setEventsRequest(eventsRequest);
811-
eventBuilder->getEvent()->setCommand(std::move(cmd));
809+
command->setTimestampPacketNode(*timestampPacketContainer, *previousTimestampPacketNodes);
810+
command->setEventsRequest(eventsRequest);
812811
}
812+
outEvent->setCommand(std::move(command));
813813

814814
eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));
815815
eventBuilder->addParentEvent(this->virtualEvent);
@@ -819,7 +819,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
819819
this->virtualEvent->decRefInternal();
820820
}
821821

822-
this->virtualEvent = eventBuilder->getEvent();
822+
this->virtualEvent = outEvent;
823823
}
824824

825825
template <typename GfxFamily>

runtime/helpers/task_information.cpp

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
8282
return completionStamp;
8383
}
8484

85-
CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> kernelOperation, std::vector<Surface *> &surfaces,
85+
CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces,
8686
bool flushDC, bool usesSLM, bool ndRangeKernel, std::unique_ptr<PrintfHandler> printfHandler,
8787
PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount)
8888
: Command(commandQueue, kernelOperation), flushDC(flushDC), slmUsed(usesSLM),
@@ -135,12 +135,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
135135
if (printfHandler) {
136136
printfHandler.get()->makeResident(commandStreamReceiver);
137137
}
138-
if (currentTimestampPacketNodes) {
139-
currentTimestampPacketNodes->makeResident(commandStreamReceiver);
140-
}
141-
if (previousTimestampPacketNodes) {
142-
previousTimestampPacketNodes->makeResident(commandStreamReceiver);
143-
}
138+
makeTimestampPacketsResident();
144139

145140
if (executionModelKernel) {
146141
uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
@@ -224,9 +219,42 @@ CompletionStamp &CommandMarker::submit(uint32_t taskLevel, bool terminated) {
224219
}
225220

226221
auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
227-
completionStamp.taskCount = commandStreamReceiver.peekTaskCount();
228-
completionStamp.taskLevel = commandStreamReceiver.peekTaskLevel();
229-
completionStamp.flushStamp = commandStreamReceiver.obtainCurrentFlushStamp();
222+
223+
if (!kernelOperation) {
224+
completionStamp.taskCount = commandStreamReceiver.peekTaskCount();
225+
completionStamp.taskLevel = commandStreamReceiver.peekTaskLevel();
226+
completionStamp.flushStamp = commandStreamReceiver.obtainCurrentFlushStamp();
227+
228+
return completionStamp;
229+
}
230+
231+
auto lockCSR = commandStreamReceiver.obtainUniqueOwnership();
232+
233+
DispatchFlags dispatchFlags;
234+
dispatchFlags.blocking = true;
235+
dispatchFlags.lowPriority = commandQueue.getPriority() == QueuePriority::LOW;
236+
dispatchFlags.throttle = commandQueue.getThrottle();
237+
dispatchFlags.preemptionMode = commandQueue.getDevice().getPreemptionMode();
238+
dispatchFlags.multiEngineQueue = commandQueue.isMultiEngineQueue();
239+
dispatchFlags.guardCommandBufferWithPipeControl = true;
240+
dispatchFlags.outOfOrderExecutionAllowed = commandStreamReceiver.isNTo1SubmissionModelEnabled();
241+
242+
UNRECOVERABLE_IF(!commandStreamReceiver.peekTimestampPacketWriteEnabled());
243+
244+
dispatchFlags.csrDependencies.fillFromEventsRequestAndMakeResident(eventsRequest, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
245+
246+
makeTimestampPacketsResident();
247+
248+
gtpinNotifyPreFlushTask(&commandQueue);
249+
250+
completionStamp = commandStreamReceiver.flushTask(*kernelOperation->commandStream,
251+
0,
252+
commandQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
253+
commandQueue.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
254+
commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
255+
taskLevel,
256+
dispatchFlags,
257+
commandQueue.getDevice());
230258

231259
return completionStamp;
232260
}
@@ -259,6 +287,17 @@ Command::~Command() {
259287
}
260288
}
261289

290+
// Makes this command's timestamp packet containers resident on the command queue's
// GPGPU command stream receiver. Either container may be unset; an unset container is skipped.
// Called from both CommandComputeKernel::submit and CommandMarker::submit.
void Command::makeTimestampPacketsResident() {
291+
auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
292+
293+
// Container for the current command, if one was stored.
if (currentTimestampPacketNodes) {
294+
currentTimestampPacketNodes->makeResident(commandStreamReceiver);
295+
}
// Container for the previous nodes, if one was stored.
if (previousTimestampPacketNodes) {
297+
previousTimestampPacketNodes->makeResident(commandStreamReceiver);
298+
}
299+
}
300+
262301
Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {}
263302

264303
Command::Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation)

runtime/helpers/task_information.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "runtime/helpers/completion_stamp.h"
1111
#include "runtime/helpers/hw_info.h"
1212
#include "runtime/helpers/properties_helper.h"
13+
#include "runtime/helpers/timestamp_packet.h"
1314
#include "runtime/indirect_heap/indirect_heap.h"
1415
#include "runtime/utilities/iflist.h"
1516

@@ -92,6 +93,7 @@ class Command : public IFNode<Command> {
9293
}
9394
void setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &previous);
9495
void setEventsRequest(EventsRequest &eventsRequest);
96+
void makeTimestampPacketsResident();
9597

9698
TagNode<HwTimeStamps> *timestamp = nullptr;
9799
CompletionStamp completionStamp = {};
@@ -122,7 +124,7 @@ class CommandMapUnmap : public Command {
122124

123125
class CommandComputeKernel : public Command {
124126
public:
125-
CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> kernelResources, std::vector<Surface *> &surfaces,
127+
CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces,
126128
bool flushDC, bool usesSLM, bool ndRangeKernel, std::unique_ptr<PrintfHandler> printfHandler,
127129
PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount);
128130

@@ -146,7 +148,6 @@ class CommandComputeKernel : public Command {
146148
class CommandMarker : public Command {
147149
public:
148150
using Command::Command;
149-
150151
CompletionStamp &submit(uint32_t taskLevel, bool terminated) override;
151152
};
152153
} // namespace NEO

unit_tests/command_queue/dispatch_walker_tests.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -745,8 +745,10 @@ HWTEST_F(DispatchWalkerTest, givenBlockedEnqueueWhenObtainingCommandStreamThenAl
745745
auto expectedSizeCS = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;
746746

747747
CsrDependencies csrDependencies;
748+
EventsRequest eventsRequest(0, nullptr, nullptr);
748749
auto cmdStream = mockCmdQ.template obtainCommandStream<CL_COMMAND_NDRANGE_KERNEL>(csrDependencies, false, false, false, true,
749-
multiDispatchInfo, blockedKernelData, nullptr, 0u);
750+
multiDispatchInfo, eventsRequest, blockedKernelData,
751+
nullptr, 0u);
750752

751753
EXPECT_EQ(expectedSizeCS, cmdStream->getMaxAvailableSpace());
752754
EXPECT_EQ(expectedSizeCSAllocation, cmdStream->getGraphicsAllocation()->getUnderlyingBufferSize());

unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class MockCommandQueueWithCacheFlush : public MockCommandQueueHw<GfxFamily> {
2424
using MockCommandQueueHw<GfxFamily>::MockCommandQueueHw;
2525

2626
public:
27-
bool isCacheFlushCommand(uint32_t commandType) override {
27+
bool isCacheFlushCommand(uint32_t commandType) const override {
2828
return commandRequireCacheFlush;
2929
}
3030
bool commandRequireCacheFlush = false;

0 commit comments

Comments
 (0)