Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions libclc/libspirv/lib/ptx-nvidiacl/group/collectives_helpers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,60 +2,60 @@
; 128 bits per warp is sufficient for all fundamental data types and complex
; Reducing storage for small data types or increasing it for user-defined types
; will likely require an additional pass to track group algorithm usage
@__clc__group_scratch = internal addrspace(3) global [128 x i64] undef, align 1
@__clc__group_scratch = internal addrspace(3) global [32 x i128] undef, align 1

define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could/should probably rewrite this to use opaque pointers while we're here. If we do, almost all of this can go away. You could just return ptr addrspace(3) @__clc__group_scratch for every overload, I think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could/should probably rewrite this to use opaque pointers while we're here.

done

If we do, almost all of this can go away. You could just return ptr addrspace(3) @__clc__group_scratch for every overload, I think?

I added more global variables for different sizes. It should resolve the comment "Reducing storage for small data types or increasing it for user-defined types will likely require an additional pass to track group algorithm usage" at the top of the file.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I take it the scratch memory isn't meant to be shared between the different types? If so we couldn't have separate globals in this way.

%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
ret i8 addrspace(3)* %cast
}

define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't switch to opaque pointers, this needs fixing. Having bitcast i64 is incorrect as %ptr would be i128 addrspace(3)*?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

ret i8 addrspace(3)* %cast
}

define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
ret i16 addrspace(3)* %cast
}

define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
ret i32 addrspace(3)* %cast
}

define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
ret i64 addrspace(3)* %cast
}

define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
ret half addrspace(3)* %cast
}

define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
ret float addrspace(3)* %cast
}

define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
ret double addrspace(3)* %cast
}
Expand All @@ -77,21 +77,21 @@ entry:

define %complex_half addrspace(3)* @__clc__get_group_scratch_complex_half() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to %complex_half addrspace(3)*
ret %complex_half addrspace(3)* %cast
}

define %complex_float addrspace(3)* @__clc__get_group_scratch_complex_float() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to %complex_float addrspace(3)*
ret %complex_float addrspace(3)* %cast
}

define %complex_double addrspace(3)* @__clc__get_group_scratch_complex_double() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%ptr = getelementptr inbounds [32 x i128], [32 x i128] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to %complex_double addrspace(3)*
ret %complex_double addrspace(3)* %cast
}
Loading