Skip to content

Commit 0289641

Browse files
add a limit for the number of operations in a batch
after reaching the threshold for the allowed number of operations enqueued in a single batch, enqueue the current batch for execution
1 parent eb590b2 commit 0289641

File tree

3 files changed

+106
-39
lines changed

3 files changed

+106
-39
lines changed

unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp

Lines changed: 49 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "../program.hpp"
2727
#include "../ur_interface_loader.hpp"
28+
#include "ur.hpp"
2829
#include "ur_api.h"
2930
#include "ze_api.h"
3031
#include <cstddef>
@@ -162,6 +163,19 @@ ur_queue_batched_t::onEventWaitListUse(ur_event_generation_t batch_generation) {
162163
}
163164
}
164165

166+
// Accounts for one more command that is about to be appended to the active
// batch.
//
// If the active batch has already reached its limit of enqueued commands, the
// queue is first finished via queueFinishUnlocked and the batch is reset to
// empty; only then is the new command counted.
//
// Returns UR_RESULT_SUCCESS once the command has been counted. The result of
// queueFinishUnlocked is checked through UR_CALL (macro defined elsewhere;
// presumably it returns the error code early on failure), so callers should
// not ignore the returned value.
ur_result_t ur_queue_batched_t::markIssuedCommandInBatch(
    locked<batch_manager> &batchLocked) {
  // Common case: the batch still has room, so just count the command.
  if (!batchLocked->isLimitOfEnqueuedCommandsReached()) {
    batchLocked->markNextIssuedCommand();
    return UR_RESULT_SUCCESS;
  }

  // The batch is full: finish the queue and restart counting from an empty
  // batch before recording the new command.
  UR_CALL(queueFinishUnlocked(batchLocked));
  batchLocked->setBatchEmpty();
  batchLocked->markNextIssuedCommand();

  return UR_RESULT_SUCCESS;
}
178+
165179
ur_result_t ur_queue_batched_t::enqueueKernelLaunch(
166180
ur_kernel_handle_t hKernel, uint32_t workDim,
167181
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -176,7 +190,7 @@ ur_result_t ur_queue_batched_t::enqueueKernelLaunch(
176190
TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueKernelLaunch");
177191
auto currentRegular = currentCmdLists.lock();
178192

179-
currentRegular->markIssuedCommand();
193+
markIssuedCommandInBatch(currentRegular);
180194

181195
UR_CALL(currentRegular->getActiveBatch().appendKernelLaunch(
182196
hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
@@ -272,7 +286,7 @@ ur_result_t ur_queue_batched_t::enqueueMemBufferRead(
272286

273287
auto lockedBatches = currentCmdLists.lock();
274288

275-
lockedBatches->markIssuedCommand();
289+
markIssuedCommandInBatch(lockedBatches);
276290

277291
UR_CALL(lockedBatches->getActiveBatch().appendMemBufferRead(
278292
hBuffer, false, offset, size, pDst, waitListView,
@@ -300,7 +314,7 @@ ur_result_t ur_queue_batched_t::enqueueMemBufferWrite(
300314

301315
auto lockedBatches = currentCmdLists.lock();
302316

303-
lockedBatches->markIssuedCommand();
317+
markIssuedCommandInBatch(lockedBatches);
304318

305319
UR_CALL(lockedBatches->getActiveBatch().appendMemBufferWrite(
306320
hBuffer, false, offset, size, pSrc, waitListView,
@@ -325,7 +339,7 @@ ur_result_t ur_queue_batched_t::enqueueDeviceGlobalVariableWrite(
325339

326340
auto lockedBatch = currentCmdLists.lock();
327341

328-
lockedBatch->markIssuedCommand();
342+
markIssuedCommandInBatch(lockedBatch);
329343

330344
UR_CALL(lockedBatch->getActiveBatch().appendDeviceGlobalVariableWrite(
331345
hProgram, name, false, count, offset, pSrc, waitListView,
@@ -347,7 +361,7 @@ ur_result_t ur_queue_batched_t::enqueueDeviceGlobalVariableRead(
347361

348362
auto lockedBatch = currentCmdLists.lock();
349363

350-
lockedBatch->markIssuedCommand();
364+
markIssuedCommandInBatch(lockedBatch);
351365

352366
UR_CALL(lockedBatch->getActiveBatch().appendDeviceGlobalVariableRead(
353367
hProgram, name, false, count, offset, pDst, waitListView,
@@ -371,7 +385,7 @@ ur_result_t ur_queue_batched_t::enqueueMemBufferFill(
371385

372386
auto lockedBatch = currentCmdLists.lock();
373387

374-
lockedBatch->markIssuedCommand();
388+
markIssuedCommandInBatch(lockedBatch);
375389

376390
return lockedBatch->getActiveBatch().appendMemBufferFill(
377391
hBuffer, pPattern, patternSize, offset, size, waitListView,
@@ -390,7 +404,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMMemcpy(
390404
wait_list_view(phEventWaitList, numEventsInWaitList, this);
391405
auto lockedBatch = currentCmdLists.lock();
392406

393-
lockedBatch->markIssuedCommand();
407+
markIssuedCommandInBatch(lockedBatch);
394408

395409
UR_CALL(lockedBatch->getActiveBatch().appendUSMMemcpy(
396410
false, pDst, pSrc, size, waitListView,
@@ -411,7 +425,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMFreeExp(
411425
wait_list_view(phEventWaitList, numEventsInWaitList, this);
412426
auto lockedBatch = currentCmdLists.lock();
413427

414-
lockedBatch->markIssuedCommand();
428+
markIssuedCommandInBatch(lockedBatch);
415429

416430
UR_CALL(lockedBatch->getActiveBatch().appendUSMFreeExp(
417431
this, pPool, pMem, waitListView,
@@ -431,7 +445,7 @@ ur_result_t ur_queue_batched_t::enqueueMemBufferMap(
431445
wait_list_view(phEventWaitList, numEventsInWaitList, this);
432446
auto lockedBatch = currentCmdLists.lock();
433447

434-
lockedBatch->markIssuedCommand();
448+
markIssuedCommandInBatch(lockedBatch);
435449

436450
UR_CALL(lockedBatch->getActiveBatch().appendMemBufferMap(
437451
hBuffer, false, mapFlags, offset, size, waitListView,
@@ -453,7 +467,7 @@ ur_result_t ur_queue_batched_t::enqueueMemUnmap(
453467
wait_list_view(phEventWaitList, numEventsInWaitList, this);
454468
auto lockedBatch = currentCmdLists.lock();
455469

456-
lockedBatch->markIssuedCommand();
470+
markIssuedCommandInBatch(lockedBatch);
457471

458472
return lockedBatch->getActiveBatch().appendMemUnmap(
459473
hMem, pMappedPtr, waitListView,
@@ -471,7 +485,7 @@ ur_result_t ur_queue_batched_t::enqueueMemBufferReadRect(
471485
wait_list_view(phEventWaitList, numEventsInWaitList, this);
472486
auto lockedBatch = currentCmdLists.lock();
473487

474-
lockedBatch->markIssuedCommand();
488+
markIssuedCommandInBatch(lockedBatch);
475489

476490
UR_CALL(lockedBatch->getActiveBatch().appendMemBufferReadRect(
477491
hBuffer, false, bufferOrigin, hostOrigin, region, bufferRowPitch,
@@ -497,7 +511,7 @@ ur_result_t ur_queue_batched_t::enqueueMemBufferWriteRect(
497511
wait_list_view(phEventWaitList, numEventsInWaitList, this);
498512
auto lockedBatch = currentCmdLists.lock();
499513

500-
lockedBatch->markIssuedCommand();
514+
markIssuedCommandInBatch(lockedBatch);
501515

502516
UR_CALL(lockedBatch->getActiveBatch().appendMemBufferWriteRect(
503517
hBuffer, false, bufferOrigin, hostOrigin, region, bufferRowPitch,
@@ -519,7 +533,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMAdvise(const void *pMem, size_t size,
519533

520534
auto lockedBatch = currentCmdLists.lock();
521535

522-
lockedBatch->markIssuedCommand();
536+
markIssuedCommandInBatch(lockedBatch);
523537

524538
return lockedBatch->getActiveBatch().appendUSMAdvise(
525539
pMem, size, advice, emptyWaitList,
@@ -535,7 +549,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMMemcpy2D(
535549
wait_list_view(phEventWaitList, numEventsInWaitList, this);
536550
auto lockedBatch = currentCmdLists.lock();
537551

538-
lockedBatch->markIssuedCommand();
552+
markIssuedCommandInBatch(lockedBatch);
539553

540554
UR_CALL(lockedBatch->getActiveBatch().appendUSMMemcpy2D(
541555
false, pDst, dstPitch, pSrc, srcPitch, width, height, waitListView,
@@ -557,7 +571,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMFill2D(
557571
wait_list_view(phEventWaitList, numEventsInWaitList, this);
558572
auto lockedBatch = currentCmdLists.lock();
559573

560-
lockedBatch->markIssuedCommand();
574+
markIssuedCommandInBatch(lockedBatch);
561575

562576
return lockedBatch->getActiveBatch().appendUSMFill2D(
563577
pMem, pitch, patternSize, pPattern, width, height, waitListView,
@@ -573,7 +587,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMPrefetch(
573587
wait_list_view(phEventWaitList, numEventsInWaitList, this);
574588
auto lockedBatch = currentCmdLists.lock();
575589

576-
lockedBatch->markIssuedCommand();
590+
markIssuedCommandInBatch(lockedBatch);
577591

578592
return lockedBatch->getActiveBatch().appendUSMPrefetch(
579593
pMem, size, flags, waitListView,
@@ -592,7 +606,7 @@ ur_result_t ur_queue_batched_t::enqueueMemBufferCopyRect(
592606
wait_list_view(phEventWaitList, numEventsInWaitList, this);
593607
auto lockedBatch = currentCmdLists.lock();
594608

595-
lockedBatch->markIssuedCommand();
609+
markIssuedCommandInBatch(lockedBatch);
596610

597611
return lockedBatch->getActiveBatch().appendMemBufferCopyRect(
598612
hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch,
@@ -608,7 +622,7 @@ ur_result_t ur_queue_batched_t::enqueueEventsWaitWithBarrier(
608622
wait_list_view(phEventWaitList, numEventsInWaitList, this);
609623
auto lockedBatch = currentCmdLists.lock();
610624

611-
lockedBatch->markIssuedCommand();
625+
markIssuedCommandInBatch(lockedBatch);
612626

613627
if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) {
614628
UR_CALL(lockedBatch->getActiveBatch().appendEventsWaitWithBarrier(
@@ -632,7 +646,7 @@ ur_queue_batched_t::enqueueEventsWait(uint32_t numEventsInWaitList,
632646

633647
auto lockedBatch = currentCmdLists.lock();
634648

635-
lockedBatch->markIssuedCommand();
649+
markIssuedCommandInBatch(lockedBatch);
636650

637651
UR_CALL(lockedBatch->getActiveBatch().appendEventsWait(
638652
waitListView, createEventIfRequestedRegular(
@@ -650,7 +664,7 @@ ur_result_t ur_queue_batched_t::enqueueMemBufferCopy(
650664

651665
auto lockedBatch = currentCmdLists.lock();
652666

653-
lockedBatch->markIssuedCommand();
667+
markIssuedCommandInBatch(lockedBatch);
654668

655669
return lockedBatch->getActiveBatch().appendMemBufferCopy(
656670
hBufferSrc, hBufferDst, srcOffset, dstOffset, size, waitListView,
@@ -667,7 +681,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMFill(
667681

668682
auto lockedBatch = currentCmdLists.lock();
669683

670-
lockedBatch->markIssuedCommand();
684+
markIssuedCommandInBatch(lockedBatch);
671685

672686
return lockedBatch->getActiveBatch().appendUSMFill(
673687
pMem, patternSize, pPattern, size, waitListView,
@@ -685,7 +699,7 @@ ur_result_t ur_queue_batched_t::enqueueMemImageRead(
685699

686700
auto lockedBatch = currentCmdLists.lock();
687701

688-
lockedBatch->markIssuedCommand();
702+
markIssuedCommandInBatch(lockedBatch);
689703

690704
UR_CALL(lockedBatch->getActiveBatch().appendMemImageRead(
691705
hImage, false, origin, region, rowPitch, slicePitch, pDst, waitListView,
@@ -709,7 +723,7 @@ ur_result_t ur_queue_batched_t::enqueueMemImageWrite(
709723

710724
auto lockedBatch = currentCmdLists.lock();
711725

712-
lockedBatch->markIssuedCommand();
726+
markIssuedCommandInBatch(lockedBatch);
713727

714728
UR_CALL(lockedBatch->getActiveBatch().appendMemImageWrite(
715729
hImage, false, origin, region, rowPitch, slicePitch, pSrc, waitListView,
@@ -732,7 +746,7 @@ ur_result_t ur_queue_batched_t::enqueueMemImageCopy(
732746

733747
auto lockedBatch = currentCmdLists.lock();
734748

735-
lockedBatch->markIssuedCommand();
749+
markIssuedCommandInBatch(lockedBatch);
736750

737751
return lockedBatch->getActiveBatch().appendMemImageCopy(
738752
hImageSrc, hImageDst, srcOrigin, dstOrigin, region, waitListView,
@@ -749,7 +763,7 @@ ur_result_t ur_queue_batched_t::enqueueReadHostPipe(
749763

750764
auto lockedBatch = currentCmdLists.lock();
751765

752-
lockedBatch->markIssuedCommand();
766+
markIssuedCommandInBatch(lockedBatch);
753767

754768
UR_CALL(lockedBatch->getActiveBatch().appendReadHostPipe(
755769
hProgram, pipe_symbol, false, pDst, size, waitListView,
@@ -772,7 +786,7 @@ ur_result_t ur_queue_batched_t::enqueueWriteHostPipe(
772786

773787
auto lockedBatch = currentCmdLists.lock();
774788

775-
lockedBatch->markIssuedCommand();
789+
markIssuedCommandInBatch(lockedBatch);
776790

777791
UR_CALL(lockedBatch->getActiveBatch().appendWriteHostPipe(
778792
hProgram, pipe_symbol, false, pSrc, size, waitListView,
@@ -796,7 +810,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMDeviceAllocExp(
796810

797811
auto lockedBatch = currentCmdLists.lock();
798812

799-
lockedBatch->markIssuedCommand();
813+
markIssuedCommandInBatch(lockedBatch);
800814

801815
UR_CALL(lockedBatch->getActiveBatch().appendUSMAllocHelper(
802816
this, pPool, size, pProperties, waitListView, ppMem,
@@ -818,7 +832,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMSharedAllocExp(
818832

819833
auto lockedBatch = currentCmdLists.lock();
820834

821-
lockedBatch->markIssuedCommand();
835+
markIssuedCommandInBatch(lockedBatch);
822836

823837
UR_CALL(lockedBatch->getActiveBatch().appendUSMAllocHelper(
824838
this, pPool, size, pProperties, waitListView, ppMem,
@@ -839,7 +853,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMHostAllocExp(
839853

840854
auto lockedBatch = currentCmdLists.lock();
841855

842-
lockedBatch->markIssuedCommand();
856+
markIssuedCommandInBatch(lockedBatch);
843857

844858
UR_CALL(lockedBatch->getActiveBatch().appendUSMAllocHelper(
845859
this, pPool, size, pProperties, waitListView, ppMem,
@@ -866,7 +880,7 @@ ur_result_t ur_queue_batched_t::bindlessImagesImageCopyExp(
866880

867881
auto lockedBatch = currentCmdLists.lock();
868882

869-
lockedBatch->markIssuedCommand();
883+
markIssuedCommandInBatch(lockedBatch);
870884

871885
return lockedBatch->getActiveBatch().bindlessImagesImageCopyExp(
872886
pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat,
@@ -885,7 +899,7 @@ ur_result_t ur_queue_batched_t::bindlessImagesWaitExternalSemaphoreExp(
885899

886900
auto lockedBatch = currentCmdLists.lock();
887901

888-
lockedBatch->markIssuedCommand();
902+
markIssuedCommandInBatch(lockedBatch);
889903

890904
return lockedBatch->getActiveBatch().bindlessImagesWaitExternalSemaphoreExp(
891905
hSemaphore, hasWaitValue, waitValue, waitListView,
@@ -902,7 +916,7 @@ ur_result_t ur_queue_batched_t::bindlessImagesSignalExternalSemaphoreExp(
902916

903917
auto lockedBatch = currentCmdLists.lock();
904918

905-
lockedBatch->markIssuedCommand();
919+
markIssuedCommandInBatch(lockedBatch);
906920

907921
return lockedBatch->getActiveBatch().bindlessImagesSignalExternalSemaphoreExp(
908922
hSemaphore, hasSignalValue, signalValue, waitListView,
@@ -934,7 +948,7 @@ ur_result_t ur_queue_batched_t::enqueueTimestampRecordingExp(
934948

935949
// auto lockedBatch = currentCmdLists.lock();
936950

937-
// lockedBatch->markIssuedCommand();
951+
// UR_CALL(markIssuedCommandInBatch(lockedBatch));
938952

939953
// UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp(
940954
// false, waitListView,
@@ -981,7 +995,7 @@ ur_result_t ur_queue_batched_t::enqueueNativeCommandExp(
981995

982996
auto lockedBatch = currentCmdLists.lock();
983997

984-
lockedBatch->markIssuedCommand();
998+
markIssuedCommandInBatch(lockedBatch);
985999

9861000
return lockedBatch->getActiveBatch().appendNativeCommandExp(
9871001
pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties,
@@ -1004,7 +1018,7 @@ ur_result_t ur_queue_batched_t::enqueueKernelLaunchWithArgsExp(
10041018

10051019
auto lockedBatch = currentCmdLists.lock();
10061020

1007-
lockedBatch->markIssuedCommand();
1021+
markIssuedCommandInBatch(lockedBatch);
10081022

10091023
return lockedBatch->getActiveBatch().appendKernelLaunchWithArgsExp(
10101024
hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,

unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ inline constexpr uint64_t initialSlotsForBatches = 10;
5555
// regularGenerationNumber below
5656
inline constexpr ur_event_generation_t initialGenerationNumber = 0;
5757

58+
// The limit of operations enqueued in the active batch (for definitions see the
59+
// comments below). Once this limit is reached, the queue is finished (see
// queueFinishUnlocked) before the next operation is counted
60+
inline constexpr uint64_t maxNumberOfEnqueuedOperations = 120;
61+
5862
struct batch_manager {
5963
private:
6064
// The currently active regular command list, which may be replaced in the
@@ -84,7 +88,7 @@ struct batch_manager {
8488
// additional submission of the current batch is not needed.
8589
ur_event_generation_t regularGenerationNumber;
8690
// Number of operations enqueued on the current batch (0 means it is empty)
87-
bool isEmpty = true;
91+
uint64_t enqueuedOperationsCounter = 0;
8892

8993
public:
9094
batch_manager(ur_context_handle_t context, ur_device_handle_t device,
@@ -129,15 +133,19 @@ struct batch_manager {
129133
return activeBatch.getZeCommandList();
130134
}
131135

132-
bool isActiveBatchEmpty() { return isEmpty; }
136+
// Reports whether no operation has been counted on the active batch yet.
bool isActiveBatchEmpty() {
  return enqueuedOperationsCounter == 0;
}
133137

134-
void markIssuedCommand() { isEmpty = false; }
138+
// Counts one more operation as enqueued on the active batch.
void markNextIssuedCommand() {
  ++enqueuedOperationsCounter;
}
135139

136-
void setBatchEmpty() { isEmpty = true; }
140+
// Resets the operation count, so the active batch is treated as empty again.
void setBatchEmpty() {
  enqueuedOperationsCounter = 0;
}
137141

138142
bool isLimitOfUsedCommandListsReached() {
139143
return initialSlotsForBatches <= runBatches.size();
140144
}
145+
146+
bool isLimitOfEnqueuedCommandsReached() {
147+
return maxNumberOfEnqueuedOperations <= enqueuedOperationsCounter;
148+
}
141149
};
142150

143151
struct ur_queue_batched_t : ur_object, ur_queue_t_ {
@@ -193,6 +201,8 @@ struct ur_queue_batched_t : ur_object, ur_queue_t_ {
193201

194202
ur_result_t queueFlushUnlocked(locked<batch_manager> &batchLocked);
195203

204+
ur_result_t markIssuedCommandInBatch(locked<batch_manager> &batchLocked);
205+
196206
public:
197207
ur_queue_batched_t(ur_context_handle_t, ur_device_handle_t, uint32_t ordinal,
198208
ze_command_queue_priority_t priority,

0 commit comments

Comments
 (0)