From 0f5efc30399968a562880b671ffe2620c456449a Mon Sep 17 00:00:00 2001
From: Wenju He
Date: Mon, 12 May 2025 19:02:05 -0700
Subject: [PATCH] [libspirv][ptx-nvidiacl] Change __clc__group_scratch size to
 32 x i128

To align with the comment in the file that specifies 32 storage locations
and 128 bits per warp.
---
 .../ptx-nvidiacl/group/collectives_helpers.ll | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll b/libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll
index f2c59b3fddd31..1de105269a09b 100644
--- a/libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll
+++ b/libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll
@@ -2,60 +2,60 @@
 ; 128 bits per warp is sufficient for all fundamental data types and complex
 ; Reducing storage for small data types or increasing it for user-defined types
 ; will likely require an additional pass to track group algorithm usage
-@__clc__group_scratch = internal addrspace(3) global [128 x i64] undef, align 1
+@__clc__group_scratch = internal addrspace(3) global [32 x i128] undef, align 1
 
 define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
   ret i8 addrspace(3)* %cast
 }
 
 define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
   ret i8 addrspace(3)* %cast
 }
 
 define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
   ret i16 addrspace(3)* %cast
 }
 
 define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
   ret i32 addrspace(3)* %cast
 }
 
 define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
   ret i64 addrspace(3)* %cast
 }
 
 define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
   ret half addrspace(3)* %cast
 }
 
 define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
   ret float addrspace(3)* %cast
 }
 
 define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
   ret double addrspace(3)* %cast
 }
@@ -77,21 +77,21 @@ entry:
 
 define %complex_half addrspace(3)* @__clc__get_group_scratch_complex_half() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to %complex_half addrspace(3)*
   ret %complex_half addrspace(3)* %cast
 }
 
 define %complex_float addrspace(3)* @__clc__get_group_scratch_complex_float() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to %complex_float addrspace(3)*
   ret %complex_float addrspace(3)* %cast
 }
 
 define %complex_double addrspace(3)* @__clc__get_group_scratch_complex_double() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to %complex_double addrspace(3)*
   ret %complex_double addrspace(3)* %cast
 }
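
For reference, the arithmetic behind the change: the new [32 x i128] layout
reserves 32 * 128 bits = 512 bytes of addrspace(3) scratch, half of the 1024
bytes that [128 x i64] occupied, while keeping one 128-bit slot per warp as
the file's comment describes. 128 bits per slot covers the widest element the
helpers hand out, %complex_double. Below is a minimal standalone sketch of
that size check, not part of the patch; it assumes %complex_double is a pair
of doubles (matching its definition elsewhere in collectives_helpers.ll) and
uses the null-GEP "sizeof" idiom purely for illustration.

; Standalone sketch, not part of the patch. Assumes %complex_double is
; { double, double }, as defined elsewhere in collectives_helpers.ll.
%complex_double = type { double, double }

define i64 @complex_double_bits() nounwind alwaysinline {
entry:
  ; null-GEP sizeof idiom: the address of element 1 of an array based at
  ; null equals the element type's size in bytes.
  %end = getelementptr %complex_double, %complex_double* null, i64 1
  %bytes = ptrtoint %complex_double* %end to i64
  %bits = mul i64 %bytes, 8          ; 16 bytes -> 128 bits, one i128 slot
  ret i64 %bits
}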