From 0f5efc30399968a562880b671ffe2620c456449a Mon Sep 17 00:00:00 2001
From: Wenju He
Date: Mon, 12 May 2025 19:02:05 -0700
Subject: [PATCH] [libspirv][ptx-nvidiacl] Change __clc__group_scratch size to
 32 x i128

To align with the comment in the file that specifies 32 storage locations
and 128 bits per warp.
---
 .../ptx-nvidiacl/group/collectives_helpers.ll | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll b/libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll
index f2c59b3fddd31..1de105269a09b 100644
--- a/libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll
+++ b/libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll
@@ -2,60 +2,60 @@
 ; 128 bits per warp is sufficient for all fundamental data types and complex
 ; Reducing storage for small data types or increasing it for user-defined types
 ; will likely require an additional pass to track group algorithm usage
-@__clc__group_scratch = internal addrspace(3) global [128 x i64] undef, align 1
+@__clc__group_scratch = internal addrspace(3) global [32 x i128] undef, align 1
 
 define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
   ret i8 addrspace(3)* %cast
 }
 
 define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
   ret i8 addrspace(3)* %cast
 }
 
 define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
   ret i16 addrspace(3)* %cast
 }
 
 define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
   ret i32 addrspace(3)* %cast
 }
 
 define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
   ret i64 addrspace(3)* %cast
 }
 
 define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
   ret half addrspace(3)* %cast
 }
 
 define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
   ret float addrspace(3)* %cast
 }
 
 define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
   ret double addrspace(3)* %cast
 }
@@ -77,21 +77,21 @@ entry:
 
 define %complex_half addrspace(3)* @__clc__get_group_scratch_complex_half() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to %complex_half addrspace(3)*
   ret %complex_half addrspace(3)* %cast
 }
 
 define %complex_float addrspace(3)* @__clc__get_group_scratch_complex_float() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to %complex_float addrspace(3)*
   ret %complex_float addrspace(3)* %cast
 }
 
 define %complex_double addrspace(3)* @__clc__get_group_scratch_complex_double() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to %complex_double addrspace(3)*
   ret %complex_double addrspace(3)* %cast
 }
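
For reference, the arithmetic behind the change: the new [32 x i128] layout
reserves 32 * 128 bits = 512 bytes of addrspace(3) scratch, half of the 1024
bytes that [128 x i64] occupied, while keeping one 128-bit slot per warp as
the file's comment describes. 128 bits per slot covers the widest element the
helpers hand out, %complex_double. Below is a minimal standalone sketch of
that size check, not part of the patch; it assumes %complex_double is a pair
of doubles (matching its definition elsewhere in collectives_helpers.ll) and
uses the null-GEP "sizeof" idiom purely for illustration.

; Standalone sketch, not part of the patch. Assumes %complex_double is
; { double, double }, as defined elsewhere in collectives_helpers.ll.
%complex_double = type { double, double }

define i64 @complex_double_bits() nounwind alwaysinline {
entry:
  ; null-GEP sizeof idiom: the address of element 1 of an array based at
  ; null equals the element type's size in bytes.
  %end = getelementptr %complex_double, %complex_double* null, i64 1
  %bytes = ptrtoint %complex_double* %end to i64
  %bits = mul i64 %bytes, 8          ; 16 bytes -> 128 bits, one i128 slot
  ret i64 %bits
}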