Skip to content

Commit bda408a

Browse files
JackAKirk and jchlanda authored
[UR][CUDA] Add opportunistic queue serialize prop, impl for cuda (#18443)
Makes short kernels launch faster when they don't need to see the same global memory (or when the user guarantees that global memory writes are complete). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization for details. Makes workloads that launch lots of short kernels, such as those in CUTLASS, great again. cc @FMarno, who identified this performance gap. --------- Signed-off-by: JackAKirk <[email protected]> Co-authored-by: Jakub Chlanda <[email protected]>
1 parent fd866a3 commit bda408a

File tree

5 files changed

+38
-0
lines changed

5 files changed

+38
-0
lines changed

unified-runtime/include/ur_api.h

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

unified-runtime/include/ur_print.hpp

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

unified-runtime/scripts/core/exp-launch-properties.yml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,8 @@ etors:
3636
desc: "work-group cluster dimensions"
3737
- name: WORK_GROUP_MEMORY
3838
desc: "Implicit work group memory allocation"
39+
- name: OPPORTUNISTIC_QUEUE_SERIALIZE
40+
desc: "Whether to opportunistically execute kernel launches serially on a native queue"
3941
--- #--------------------------------------------------------------------------
4042
type: union
4143
desc: "Specifies a launch property value"
@@ -56,6 +58,10 @@ members:
5658
name: workgroup_mem_size
5759
desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
5860
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
61+
- type: int
62+
name: opportunistic_queue_serialize
63+
desc: "[in] non-zero value indicates an opportunistic native queue serialized kernel"
64+
tag: $X_EXP_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE
5965
--- #--------------------------------------------------------------------------
6066
type: struct
6167
desc: "Kernel launch property"

unified-runtime/source/adapters/cuda/enqueue.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -560,6 +560,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
560560
attr.value.cooperative = launchPropList[i].value.cooperative;
561561
break;
562562
}
563+
case UR_EXP_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE: {
564+
auto &attr = launch_attribute.emplace_back();
565+
attr.id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
566+
attr.value.programmaticStreamSerializationAllowed =
567+
launchPropList[i].value.opportunistic_queue_serialize;
568+
break;
569+
}
563570
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
564571
break;
565572
}

unified-runtime/test/conformance/exp_launch_properties/launch_properties.cpp

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,15 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
6666
props.push_back(coop_prop);
6767
}
6868

69+
if (compute_capability >= 9.0) {
70+
ur_exp_launch_property_t opportunistic_queue_serialize_prop;
71+
opportunistic_queue_serialize_prop.id =
72+
UR_EXP_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE;
73+
opportunistic_queue_serialize_prop.value.opportunistic_queue_serialize =
74+
1;
75+
props.push_back(opportunistic_queue_serialize_prop);
76+
}
77+
6978
ur_bool_t cluster_launch_supported = false;
7079
ASSERT_SUCCESS(
7180
urDeviceGetInfo(device, UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP,

0 commit comments

Comments
 (0)