diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 6d74a7b2847f7..36f2093a44b7f 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1667,12 +1667,12 @@ def SYCLIntelESimdVectorize : InheritableAttr { } def SYCLScope : Attr { - // No spelling, as this attribute can't be created in the source code. - let Spellings = []; + let Spellings = [CXX11<"__sycl_detail__", "wg_scope">]; let Args = [EnumArgument<"level", "Level", /*is_string=*/false, ["work_group", "work_item"], - ["WorkGroup", "WorkItem"]>]; - let Subjects = SubjectList<[Function, Var]>; + ["WorkGroup", "WorkItem"], + /*optional=*/true>]; + let Subjects = SubjectList<[Function, Var, CXXRecord]>; let LangOpts = [SYCLIsDevice]; let AdditionalMembers = [{ @@ -1685,7 +1685,7 @@ def SYCLScope : Attr { } }]; - let Documentation = [InternalOnly]; + let Documentation = [SYCLWGScopeDocs]; } def SYCLDeviceIndirectlyCallable : InheritableAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 214329b7a4ded..7ad86502be22e 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -4097,6 +4097,18 @@ function pointer for the specified function. }]; } +def SYCLWGScopeDocs : Documentation { + let Category = DocCatFunction; + let Heading = "__sycl_detail__::wg_scope"; + let Content = [{ +This attribute can only be applied to records with a trivial default constructor and destructor. +Types with this attribute cannot be used for non-static data members. +It indicates that any block and namespace scope variable of a type holding this attribute +will be allocated in local memory. For variables allocated in block scope, they behave +as implicitly declared as static. 
+ }]; +} + def SYCLDeviceDocs : Documentation { let Category = DocCatFunction; let Heading = "sycl_device"; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 55f0917a99ac4..817744cc23053 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12547,6 +12547,11 @@ def err_sycl_external_global : Error< def warn_sycl_kernel_too_big_args : Warning< "size of kernel arguments (%0 bytes) may exceed the supported maximum " "of %1 bytes on some devices">, InGroup, ShowInSystemHeader; +def err_sycl_wg_scope : Error< + "SYCL work group scope only applies to class with a trivial " + "%select{default constructor|destructor}0">; +def err_sycl_field_with_wg_scope : Error< + "non-static data member is of a type with a SYCL work group scope attribute applied to it">; def err_sycl_virtual_types : Error< "no class with a vtable can be used in a SYCL kernel or any code included in the kernel">; def note_sycl_recursive_function_declared_here: Note<"function implemented using recursion declared here">; diff --git a/clang/include/clang/Sema/SemaSYCL.h b/clang/include/clang/Sema/SemaSYCL.h index a79d9f55d7736..1efa9e634e317 100644 --- a/clang/include/clang/Sema/SemaSYCL.h +++ b/clang/include/clang/Sema/SemaSYCL.h @@ -267,6 +267,7 @@ class SemaSYCL : public SemaBase { void CheckSYCLKernelCall(FunctionDecl *CallerFunc, ArrayRef Args); + void CheckSYCLScopeAttr(CXXRecordDecl *Decl); /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current /// context is "used as device code". 
@@ -478,6 +479,7 @@ class SemaSYCL : public SemaBase { const ParsedAttr &AL); void handleSYCLIntelMaxWorkGroupsPerMultiprocessor(Decl *D, const ParsedAttr &AL); + void handleSYCLScopeAttr(Decl *D, const ParsedAttr &AL); void checkSYCLAddIRAttributesFunctionAttrConflicts(Decl *D); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 8afe658c3e553..3e5f5005e734a 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -217,7 +217,11 @@ void CodeGenFunction::EmitVarDecl(const VarDecl &D) { if (D.getType().getAddressSpace() == LangAS::opencl_local) return CGM.getOpenCLRuntime().EmitWorkGroupLocalVarDecl(*this, D); - if (D.getAttr() && D.getAttr()->isWorkGroup()) + SYCLScopeAttr *ScopeAttr = D.getAttr(); + if (!ScopeAttr) + if (auto *RD = D.getType()->getAsCXXRecordDecl()) + ScopeAttr = RD->getAttr(); + if (ScopeAttr && ScopeAttr->isWorkGroup()) return CGM.getSYCLRuntime().emitWorkGroupLocalVarDecl(*this, D); assert(D.hasLocalStorage()); diff --git a/clang/lib/CodeGen/CGSYCLRuntime.cpp b/clang/lib/CodeGen/CGSYCLRuntime.cpp index bdad6867182f9..6b1abe409c1d1 100644 --- a/clang/lib/CodeGen/CGSYCLRuntime.cpp +++ b/clang/lib/CodeGen/CGSYCLRuntime.cpp @@ -96,7 +96,10 @@ void CGSYCLRuntime::emitWorkGroupLocalVarDecl(CodeGenFunction &CGF, const VarDecl &D) { #ifndef NDEBUG SYCLScopeAttr *Scope = D.getAttr(); - assert(Scope && Scope->isWorkGroup() && "work group scope expected"); + if (!Scope) + if (auto *RD = D.getType()->getAsCXXRecordDecl()) + Scope = RD->getAttr(); + assert((Scope && Scope->isWorkGroup()) && "work group scope expected"); #endif // NDEBUG // generate global variable in the address space selected by the clang CodeGen // (should be local) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index bbac92cc43b1e..1a662e0164dd5 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -5789,6 +5789,9 @@ LangAS 
CodeGenModule::GetGlobalVarAddressSpace(const VarDecl *D) { if (LangOpts.SYCLIsDevice && D) { auto *Scope = D->getAttr(); + if (!Scope) + if (auto *RD = D->getType()->getAsCXXRecordDecl()) + Scope = RD->getAttr(); if (Scope && Scope->isWorkGroup()) return LangAS::sycl_local; } diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 89029269d93c6..dae8000c91ff3 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1849,7 +1849,9 @@ class DeferredDiagnosticsEmitter if (!S.SYCL().checkAllowedSYCLInitializer(VD) && !S.SYCL() .isTypeDecoratedWithDeclAttribute< - SYCLGlobalVariableAllowedAttr>(VD->getType())) { + SYCLGlobalVariableAllowedAttr>(VD->getType()) && + !S.SYCL().isTypeDecoratedWithDeclAttribute( + VD->getType())) { S.Diag(Loc, diag::err_sycl_restrict) << SemaSYCL::KernelConstStaticVariable; return; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 879428ce11b86..1572e76abcecf 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7870,6 +7870,8 @@ NamedDecl *Sema::ActOnVariableDeclarator( // attribute. if (SCSpec == DeclSpec::SCS_static && !R.isConstant(Context) && !SYCL().isTypeDecoratedWithDeclAttribute( + NewVD->getType()) && + !SYCL().isTypeDecoratedWithDeclAttribute( NewVD->getType())) SYCL().DiagIfDeviceCode(D.getIdentifierLoc(), diag::err_sycl_restrict) << SemaSYCL::KernelNonConstStaticDataVariable; @@ -18551,6 +18553,14 @@ FieldDecl *Sema::CheckFieldDecl(DeclarationName Name, QualType T, InvalidDecl = true; } + if (LangOpts.SYCLIsDevice) { + const CXXRecordDecl *RD = T->getAsCXXRecordDecl(); + if (RD && RD->hasAttr()) { + Diag(Loc, diag::err_sycl_field_with_wg_scope); + InvalidDecl = true; + } + } + if (LangOpts.OpenCL) { // OpenCL v1.2 s6.9b,r & OpenCL v2.0 s6.12.5 - The following types cannot be // used as structure or union field: image, sampler, event or block types. 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 5da515598910c..02a0f306beb80 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6705,6 +6705,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_SYCLDevice: S.SYCL().handleSYCLDeviceAttr(D, AL); break; + case ParsedAttr::AT_SYCLScope: + S.SYCL().handleSYCLScopeAttr(D, AL); + break; case ParsedAttr::AT_SYCLDeviceIndirectlyCallable: S.SYCL().handleSYCLDeviceIndirectlyCallableAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index ceea4ac99d236..8994c87baf2fd 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -7218,6 +7218,9 @@ void Sema::CheckCompletedCXXClass(Scope *S, CXXRecordDecl *Record) { else if (Record->hasAttr()) checkCUDADeviceBuiltinTextureClassTemplate(*this, Record); } + if (getLangOpts().SYCLIsDevice && Record->hasAttr()) { + SYCL().CheckSYCLScopeAttr(Record); + } } /// Look up the special member function that would be called by a special diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 849e14db9c217..6e31c15331d59 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -235,7 +235,9 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, VD->getStorageClass() == SC_Static && !VD->hasAttr() && !SemaSYCL::isTypeDecoratedWithDeclAttribute< - SYCLGlobalVariableAllowedAttr>(VD->getType())) + SYCLGlobalVariableAllowedAttr>(VD->getType()) && + !SemaSYCL::isTypeDecoratedWithDeclAttribute( + VD->getType())) SYCL().DiagIfDeviceCode(*Locs.begin(), diag::err_sycl_restrict) << SemaSYCL::KernelNonConstStaticDataVariable; // Non-const globals are not allowed in SYCL except for ESIMD or with the @@ -243,7 +245,9 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, else if (IsRuntimeEvaluated && !IsEsimdPrivateGlobal && !IsConst && VD->hasGlobalStorage() && 
!VD->hasAttr() && !SemaSYCL::isTypeDecoratedWithDeclAttribute< - SYCLGlobalVariableAllowedAttr>(VD->getType())) + SYCLGlobalVariableAllowedAttr>(VD->getType()) && + !SemaSYCL::isTypeDecoratedWithDeclAttribute( + VD->getType())) SYCL().DiagIfDeviceCode(*Locs.begin(), diag::err_sycl_restrict) << SemaSYCL::KernelGlobalVariable; // ESIMD globals cannot be used in a SYCL context. diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 767dde6512b83..1db4ce7e807f9 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -5104,6 +5104,28 @@ void SemaSYCL::CheckSYCLKernelCall(FunctionDecl *KernelFunc, KernelFunc->setInvalidDecl(); } +void SemaSYCL::CheckSYCLScopeAttr(CXXRecordDecl *Decl) { + assert(Decl->hasAttr()); + + bool HasError = false; + + if (Decl->isDependentContext()) + return; + + // We don't emit both diags at the time as note will only be emitted for the + // first, which is confusing. So we check both cases but only report one. + if (!Decl->hasTrivialDefaultConstructor()) { + Diag(Decl->getLocation(), diag::err_sycl_wg_scope) << 0; + HasError = true; + } else if (!Decl->hasTrivialDestructor()) { + Diag(Decl->getLocation(), diag::err_sycl_wg_scope) << 1; + HasError = true; + } + + if (HasError) + Decl->dropAttr(); +} + // For a wrapped parallel_for, copy attributes from original // kernel to wrapped kernel. 
void SemaSYCL::copySYCLKernelAttrs(CXXMethodDecl *CallOperator) { diff --git a/clang/lib/Sema/SemaSYCLDeclAttr.cpp b/clang/lib/Sema/SemaSYCLDeclAttr.cpp index db0a2fdce4aad..c2268d9f35b14 100644 --- a/clang/lib/Sema/SemaSYCLDeclAttr.cpp +++ b/clang/lib/Sema/SemaSYCLDeclAttr.cpp @@ -3109,6 +3109,21 @@ void SemaSYCL::handleSYCLRegisterNumAttr(Decl *D, const ParsedAttr &AL) { D->addAttr(::new (Context) SYCLRegisterNumAttr(Context, AL, RegNo)); } +void SemaSYCL::handleSYCLScopeAttr(Decl *D, const ParsedAttr &AL) { + if (!AL.checkExactlyNumArgs(SemaRef, 0)) + return; + if (auto *CRD = dyn_cast(D); + !CRD || !(CRD->isClass() || CRD->isStruct())) { + SemaRef.Diag(AL.getRange().getBegin(), + diag::err_attribute_wrong_decl_type_str) + << AL << AL.isRegularKeywordAttribute() << "classes"; + return; + } + + D->addAttr(SYCLScopeAttr::Create(SemaRef.getASTContext(), + SYCLScopeAttr::Level::WorkGroup, AL)); +} + void SemaSYCL::checkSYCLAddIRAttributesFunctionAttrConflicts(Decl *D) { const auto *AddIRFuncAttr = D->getAttr(); diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index caf210e5c1dab..b0a99c82110e9 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -203,6 +203,7 @@ // CHECK-NEXT: SYCLIntelSchedulerTargetFmaxMhz (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelUseStallEnableClusters (SubjectMatchRule_function) // CHECK-NEXT: SYCLRegisterNum (SubjectMatchRule_variable_is_global) +// CHECK-NEXT: SYCLScope (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record) // CHECK-NEXT: SYCLSimd (SubjectMatchRule_function, SubjectMatchRule_variable_is_global) // CHECK-NEXT: SYCLSpecialClass (SubjectMatchRule_record) // CHECK-NEXT: SYCLType (SubjectMatchRule_record, SubjectMatchRule_enum) diff --git a/clang/test/SemaSYCL/sycl_wg_scope.cpp 
b/clang/test/SemaSYCL/sycl_wg_scope.cpp new file mode 100644 index 0000000000000..51f9616d47528 --- /dev/null +++ b/clang/test/SemaSYCL/sycl_wg_scope.cpp @@ -0,0 +1,94 @@ +// Verify the use of wg_scope is correctly diagnosed. +// RUN: %clang_cc1 -fsycl-is-device -verify %s + +class [[__sycl_detail__::wg_scope]] G1 {}; +class [[__sycl_detail__::wg_scope]] G2 { + G2() = default; + G2(int i) : i(i) {} + int i; +}; + +class [[__sycl_detail__::wg_scope]] G3 { + ~G3() = default; +}; + +class [[__sycl_detail__::wg_scope]] B4 { // expected-error {{SYCL work group scope only applies to class with a trivial default constructor}} + B4() {} +}; + +class [[__sycl_detail__::wg_scope]] B5 { // expected-error {{SYCL work group scope only applies to class with a trivial destructor}} + ~B5() {} +}; + +class [[__sycl_detail__::wg_scope]] B6 { // expected-error {{SYCL work group scope only applies to class with a trivial default constructor}} + B6() {} + ~B6() {} +}; + +template class [[__sycl_detail__::wg_scope]] B7 { // #B7 +public: + T obj; +}; + +struct Valid {}; +struct InvalidCtor { + InvalidCtor() {} +}; +struct InvalidDtor { + ~InvalidDtor() {} +}; +struct InvalidCDtor { + InvalidCDtor() {} + ~InvalidCDtor() {} +}; + +B7 b7; +// expected-error@#B7 {{SYCL work group scope only applies to class with a trivial default constructor}} +// expected-note@+1 {{in instantiation of template class 'B7' requested here}} +B7 b9; +// expected-error@#B7 {{SYCL work group scope only applies to class with a trivial destructor}} +// expected-note@+1 {{in instantiation of template class 'B7' requested here}} +B7 b10; +// expected-error@#B7 {{SYCL work group scope only applies to class with a trivial default constructor}} +// expected-note@+1 {{in instantiation of template class 'B7' requested here}} +B7 b11; + +template class [[__sycl_detail__::wg_scope]] B12 { // #B12 +public: + B12() = default; + ~B12() = default; + T obj; +}; + +B12 b12; +// expected-error@#B12 {{SYCL work group scope only 
applies to class with a trivial default constructor}} +// expected-note@+1 {{in instantiation of template class 'B12' requested here}} +B12 b13; + +class B14 { + G1 field; // expected-error {{non-static data member is of a type with a SYCL work group scope attribute applied to it}} +}; + +template class B15 { + T field; // #B15-field +}; + +// expected-error@#B15-field {{non-static data member is of a type with a SYCL work group scope attribute applied to it}} +// expected-note@+1 {{in instantiation of template class 'B15' requested here}} +B15 b15; + +G1 g16; +static G1 g17; + +struct Wrap { + static G1 g18; +}; + +__attribute__((sycl_device)) void ref_func() { + G1 g19; + static G1 g20; + + (void)g16; + (void)g17; + (void)Wrap::g18; +} diff --git a/llvm/include/llvm/SYCLLowerIR/LowerWGLocalMemory.h b/llvm/include/llvm/SYCLLowerIR/LowerWGLocalMemory.h index cdb8e6dff0218..62befadd417cf 100644 --- a/llvm/include/llvm/SYCLLowerIR/LowerWGLocalMemory.h +++ b/llvm/include/llvm/SYCLLowerIR/LowerWGLocalMemory.h @@ -47,6 +47,11 @@ class SYCLLowerWGLocalMemoryPass ModulePass *createSYCLLowerWGLocalMemoryLegacyPass(); void initializeSYCLLowerWGLocalMemoryLegacyPass(PassRegistry &); +namespace sycl { +std::vector> +getKernelNamesUsingImplicitLocalMem(const Module &M); +} + } // namespace llvm #endif // LLVM_SYCLLOWERIR_LOWERWGLOCALMEMORY_H diff --git a/llvm/include/llvm/Support/PropertySetIO.h b/llvm/include/llvm/Support/PropertySetIO.h index 13cb687f3b08b..201a93d19e7bc 100644 --- a/llvm/include/llvm/Support/PropertySetIO.h +++ b/llvm/include/llvm/Support/PropertySetIO.h @@ -210,6 +210,7 @@ class PropertySetRegistry { static constexpr char SYCL_DEVICE_REQUIREMENTS[] = "SYCL/device requirements"; static constexpr char SYCL_HOST_PIPES[] = "SYCL/host pipes"; static constexpr char SYCL_VIRTUAL_FUNCTIONS[] = "SYCL/virtual functions"; + static constexpr char SYCL_IMPLICIT_LOCAL_ARG[] = "SYCL/implicit local arg"; /// Function for bulk addition of an entire property set in the 
given /// \p Category . diff --git a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp index cc287b9101fa8..fd79ac3cd9a31 100644 --- a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp +++ b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp @@ -16,6 +16,7 @@ #include "llvm/SYCLLowerIR/CompileTimePropertiesPass.h" #include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/SYCLLowerIR/HostPipes.h" +#include "llvm/SYCLLowerIR/LowerWGLocalMemory.h" #include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/SYCLLowerIR/SYCLDeviceLibReqMask.h" #include "llvm/SYCLLowerIR/SYCLKernelParamOptInfo.h" @@ -397,6 +398,13 @@ PropSetRegTy computeModuleProperties(const Module &M, for (const StringRef &FName : FuncNames) PropSet.add(PropSetRegTy::SYCL_ASSERT_USED, FName, true); } + { + std::vector> ArgPos = + getKernelNamesUsingImplicitLocalMem(M); + for (const auto &FuncAndArgPos : ArgPos) + PropSet.add(PropSetRegTy::SYCL_IMPLICIT_LOCAL_ARG, FuncAndArgPos.first, + FuncAndArgPos.second); + } { if (isModuleUsingAsan(M)) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index 1ca82ae078df0..bb17dc3bc69cd 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -13,13 +13,20 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/Pass.h" +#include "llvm/TargetParser/Triple.h" using namespace llvm; #define DEBUG_TYPE "LowerWGLocalMemory" static constexpr char SYCL_ALLOCLOCALMEM_CALL[] = "__sycl_allocateLocalMemory"; +static constexpr char SYCL_DYNAMIC_LOCALMEM_CALL[] = + "__sycl_dynamicLocalMemoryPlaceholder"; static constexpr char LOCALMEMORY_GV_PREF[] = "WGLocalMem"; +static constexpr char DYNAMIC_LOCALMEM_GV[] = + "__sycl_dynamicLocalMemoryPlaceholder_GV"; +static constexpr char WORK_GROUP_STATIC_ATTR[] = "sycl-work-group-static"; +static constexpr char WORK_GROUP_STATIC_ARG_ATTR[] = 
"sycl-implicit-local-arg"; namespace { class SYCLLowerWGLocalMemoryLegacy : public ModulePass { @@ -42,6 +49,31 @@ class SYCLLowerWGLocalMemoryLegacy : public ModulePass { }; } // namespace +std::vector> +sycl::getKernelNamesUsingImplicitLocalMem(const Module &M) { + std::vector> SPIRKernelNames; + Triple TT(M.getTargetTriple()); + + if (TT.isSPIROrSPIRV()) { + auto GetArgumentPos = [&](const Function &F) -> int { + for (const Argument &Arg : F.args()) + if (F.getAttributes().hasParamAttr(Arg.getArgNo(), + WORK_GROUP_STATIC_ARG_ATTR)) + return Arg.getArgNo(); + // Not lowered to an implicit arg or DAE. + return -1; + }; + llvm::for_each(M.functions(), [&](const Function &F) { + if (F.getCallingConv() == CallingConv::SPIR_KERNEL && + F.hasFnAttribute(WORK_GROUP_STATIC_ATTR)) { + int ArgPos = GetArgumentPos(F); + SPIRKernelNames.emplace_back(F.getName(), ArgPos); + } + }); + } + return SPIRKernelNames; +} + char SYCLLowerWGLocalMemoryLegacy::ID = 0; INITIALIZE_PASS(SYCLLowerWGLocalMemoryLegacy, "sycllowerwglocalmemory", "Replace __sycl_allocateLocalMemory with allocation of memory " @@ -90,17 +122,27 @@ static void lowerAllocaLocalMemCall(CallInst *CI, Module &M) { CI->replaceAllUsesWith(GVPtr); } -static bool allocaWGLocalMemory(Module &M) { - Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); - if (!ALMFunc) - return false; +static void +lowerDynamicLocalMemCallDirect(CallInst *CI, Triple TT, + GlobalVariable *LocalMemPlaceholder) { + assert(CI); - assert(ALMFunc->isDeclaration() && "should have declaration only"); + Value *GVPtr = [&]() -> Value * { + IRBuilder<> Builder(CI); + if (TT.isSPIROrSPIRV()) + return Builder.CreateLoad(CI->getType(), LocalMemPlaceholder); + return Builder.CreatePointerCast(LocalMemPlaceholder, CI->getType()); + }(); + CI->replaceAllUsesWith(GVPtr); +} + +static void lowerLocalMemCall(Function *LocalMemAllocFunc, + std::function TransformCall) { SmallVector DelCalls; - for (User *U : ALMFunc->users()) { + for (User *U : 
LocalMemAllocFunc->users()) { auto *CI = cast(U); - lowerAllocaLocalMemCall(CI, M); + TransformCall(CI); DelCalls.push_back(CI); } @@ -110,15 +152,173 @@ static bool allocaWGLocalMemory(Module &M) { } // Remove __sycl_allocateLocalMemory declaration. - assert(ALMFunc->use_empty() && "__sycl_allocateLocalMemory is still in use"); - ALMFunc->eraseFromParent(); + assert(LocalMemAllocFunc->use_empty() && + "local mem allocation function is still in use"); + LocalMemAllocFunc->eraseFromParent(); +} + +static bool allocaWGLocalMemory(Module &M) { + Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); + if (!ALMFunc) + return false; + + assert(ALMFunc->isDeclaration() && "should have declaration only"); + + lowerLocalMemCall(ALMFunc, + [&](CallInst *CI) { lowerAllocaLocalMemCall(CI, M); }); + + return true; +} + +// For dynamic memory we have 2 case: +// - Direct for CUDA/HIP: we create a placeholder and set the memory on +// launch. +// - Indirect for OpenCL/Level0: we create a shared value holding the pointer +// to the buffer passed as argument. +static bool dynamicWGLocalMemory(Module &M) { + Function *DLMFunc = M.getFunction(SYCL_DYNAMIC_LOCALMEM_CALL); + if (!DLMFunc) + return false; + + GlobalVariable *LocalMemArrayGV = + M.getGlobalVariable(DYNAMIC_LOCALMEM_GV, true); + Triple TT(M.getTargetTriple()); + unsigned LocalAS = DLMFunc->getReturnType()->getPointerAddressSpace(); + + if (!LocalMemArrayGV) { + assert(DLMFunc->isDeclaration() && "should have declaration only"); + Type *LocalMemArrayTy = + TT.isSPIROrSPIRV() + ? static_cast(PointerType::get(M.getContext(), LocalAS)) + : static_cast( + ArrayType::get(Type::getInt8Ty(M.getContext()), 0)); + LocalMemArrayGV = new GlobalVariable( + M, // module + LocalMemArrayTy, // type + false, // isConstant + TT.isSPIROrSPIRV() ? GlobalValue::LinkOnceODRLinkage + : GlobalValue::ExternalLinkage, // Linkage + TT.isSPIROrSPIRV() ? 
UndefValue::get(LocalMemArrayTy) + : nullptr, // Initializer + DYNAMIC_LOCALMEM_GV, // Name prefix + nullptr, // InsertBefore + GlobalVariable::NotThreadLocal, // ThreadLocalMode + LocalAS // AddressSpace + ); + constexpr int DefaultMaxAlignment = 128; + if (!TT.isSPIROrSPIRV()) + LocalMemArrayGV->setAlignment(Align{DefaultMaxAlignment}); + } + lowerLocalMemCall(DLMFunc, [&](CallInst *CI) { + lowerDynamicLocalMemCallDirect(CI, TT, LocalMemArrayGV); + }); + if (TT.isSPIROrSPIRV()) { + SmallVector Kernels; + llvm::for_each(M.functions(), [&](Function &F) { + if (F.getCallingConv() == CallingConv::SPIR_KERNEL && + F.hasFnAttribute(WORK_GROUP_STATIC_ATTR)) { + Kernels.push_back(&F); + } + }); + for (Function *OldKernel : Kernels) { + FunctionType *FuncTy = OldKernel->getFunctionType(); + const AttributeList &FuncAttrs = OldKernel->getAttributes(); + Type *ImplicitLocalPtr = PointerType::get(M.getContext(), LocalAS); + + // Construct an argument list containing all of the previous arguments. + SmallVector Arguments; + SmallVector ArgumentAttributes; + for (const auto &I : enumerate(OldKernel->args())) { + Arguments.push_back(I.value().getType()); + ArgumentAttributes.push_back(FuncAttrs.getParamAttrs(I.index())); + } + + Arguments.push_back(ImplicitLocalPtr); + ArgumentAttributes.push_back(AttributeSet::get( + M.getContext(), + ArrayRef{ + Attribute::get(M.getContext(), Attribute::NoAlias), + Attribute::get(M.getContext(), WORK_GROUP_STATIC_ARG_ATTR)})); + + // Build the new function. + AttributeList NAttrs = + AttributeList::get(OldKernel->getContext(), FuncAttrs.getFnAttrs(), + FuncAttrs.getRetAttrs(), ArgumentAttributes); + assert(!FuncTy->isVarArg() && "Variadic arguments prohibited in SYCL"); + FunctionType *NewFuncTy = FunctionType::get( + FuncTy->getReturnType(), Arguments, FuncTy->isVarArg()); + + Function *NewFunc = Function::Create(NewFuncTy, OldKernel->getLinkage(), + OldKernel->getAddressSpace()); + + // Keep original function ordering. 
+ M.getFunctionList().insertAfter(OldKernel->getIterator(), NewFunc); + + NewFunc->copyAttributesFrom(OldKernel); + NewFunc->setComdat(OldKernel->getComdat()); + NewFunc->setAttributes(NAttrs); + NewFunc->takeName(OldKernel); + + // Splice the body of the old function right into the new function. + NewFunc->splice(NewFunc->begin(), OldKernel); + + for (Function::arg_iterator FuncArg = OldKernel->arg_begin(), + FuncEnd = OldKernel->arg_end(), + NewFuncArg = NewFunc->arg_begin(); + FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) { + FuncArg->replaceAllUsesWith(NewFuncArg); + } + + // Clone metadata of the old function, including debug info descriptor. + SmallVector, 1> MDs; + OldKernel->getAllMetadata(MDs); + for (const auto &MD : MDs) + NewFunc->addMetadata(MD.first, *MD.second); + // Store the pointer to the implicit local memory into the global + // handler. + IRBuilder<> Builder(&NewFunc->getEntryBlock(), + NewFunc->getEntryBlock().getFirstNonPHIIt()); + Builder.CreateStore(NewFunc->getArg(NewFunc->arg_size() - 1), + LocalMemArrayGV); + OldKernel->eraseFromParent(); + auto FixupMetadata = [&](StringRef MDName, Metadata *NewV) { + auto *Node = NewFunc->getMetadata(MDName); + if (!Node) + return; + SmallVector NewMD(Node->operands()); + NewMD.emplace_back(NewV); + NewFunc->setMetadata(MDName, + llvm::MDNode::get(NewFunc->getContext(), NewMD)); + }; + + FixupMetadata("kernel_arg_buffer_location", + ConstantAsMetadata::get(Builder.getInt32(-1))); + FixupMetadata("kernel_arg_runtime_aligned", + ConstantAsMetadata::get(Builder.getFalse())); + FixupMetadata("kernel_arg_exclusive_ptr", + ConstantAsMetadata::get(Builder.getFalse())); + + FixupMetadata("kernel_arg_addr_space", + ConstantAsMetadata::get(Builder.getInt32(LocalAS))); + FixupMetadata("kernel_arg_access_qual", + MDString::get(M.getContext(), "read_write")); + FixupMetadata("kernel_arg_type", MDString::get(M.getContext(), "void*")); + FixupMetadata("kernel_arg_base_type", + MDString::get(M.getContext(), "void*")); 
+ FixupMetadata("kernel_arg_type_qual", MDString::get(M.getContext(), "")); + FixupMetadata("kernel_arg_accessor_ptr", + ConstantAsMetadata::get(Builder.getFalse())); + } + } return true; } PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M, ModuleAnalysisManager &) { - if (allocaWGLocalMemory(M)) + bool MadeChanges = allocaWGLocalMemory(M); + MadeChanges = dynamicWGLocalMemory(M) || MadeChanges; + if (MadeChanges) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp index 2fe7cac00fb14..ca8b014c97a0e 100644 --- a/llvm/lib/Support/PropertySetIO.cpp +++ b/llvm/lib/Support/PropertySetIO.cpp @@ -207,6 +207,7 @@ constexpr char PropertySetRegistry::SYCL_DEVICE_GLOBALS[]; constexpr char PropertySetRegistry::SYCL_DEVICE_REQUIREMENTS[]; constexpr char PropertySetRegistry::SYCL_HOST_PIPES[]; constexpr char PropertySetRegistry::SYCL_VIRTUAL_FUNCTIONS[]; +constexpr char PropertySetRegistry::SYCL_IMPLICIT_LOCAL_ARG[]; } // namespace util } // namespace llvm diff --git a/llvm/test/SYCLLowerIR/work_group_static.ll b/llvm/test/SYCLLowerIR/work_group_static.ll new file mode 100644 index 0000000000000..105bb270f3450 --- /dev/null +++ b/llvm/test/SYCLLowerIR/work_group_static.ll @@ -0,0 +1,42 @@ +; RUN: opt -S -sycllowerwglocalmemory -bugpoint-enable-legacy-pm < %s | FileCheck %s +; RUN: opt -S -passes=sycllowerwglocalmemory < %s | FileCheck %s + +; CHECK-NOT: __sycl_dynamicLocalMemoryPlaceholder + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +; CHECK: @__sycl_dynamicLocalMemoryPlaceholder_GV = linkonce_odr addrspace(3) global ptr addrspace(3) undef + +; Function Attrs: convergent norecurse +; CHECK: @_ZTS7KernelA(ptr addrspace(1) %0, ptr addrspace(3) noalias "sycl-implicit-local-arg" %[[IMPLICT_ARG:[a-zA-Z0-9]+]]{{.*}} !kernel_arg_addr_space 
![[ADDR_SPACE_MD:[0-9]+]] +define weak_odr dso_local spir_kernel void @_ZTS7KernelA(ptr addrspace(1) %0) local_unnamed_addr #0 !kernel_arg_addr_space !5 { +entry: +; CHECK: store ptr addrspace(3) %[[IMPLICT_ARG]], ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV +; CHECK: %[[LD1:[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV + %1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1 +; CHECK: getelementptr inbounds i8, ptr addrspace(3) %[[LD1]] + %2 = getelementptr inbounds i8, ptr addrspace(3) %1, i64 4 +; CHECK: %[[LD2:[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV + %3 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 4) #1 + ret void +} + +; Function Attrs: convergent +declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1 + +attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" } +attributes #1 = { convergent norecurse } + +!llvm.module.flags = !{!0} +!opencl.spir.version = !{!1} +!spirv.Source = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{i32 4, i32 100000} +!3 = !{!"clang version 13.0.0"} +!4 = !{} +; ![[ADDR_SPACE_MD]] = !{i32 1, i32 3} +!5 = !{i32 1} diff --git a/llvm/test/SYCLLowerIR/work_group_static_nv.ll b/llvm/test/SYCLLowerIR/work_group_static_nv.ll new file mode 100644 index 0000000000000..cc957e45ea0a8 --- /dev/null +++ b/llvm/test/SYCLLowerIR/work_group_static_nv.ll @@ -0,0 +1,38 @@ +; RUN: opt -S 
-sycllowerwglocalmemory -bugpoint-enable-legacy-pm < %s | FileCheck %s +; RUN: opt -S -passes=sycllowerwglocalmemory < %s | FileCheck %s + +; CHECK-NOT: __sycl_dynamicLocalMemoryPlaceholder + +target triple = "nvptx64-nvidia-cuda" + +; CHECK: @__sycl_dynamicLocalMemoryPlaceholder_GV = external addrspace(3) global [0 x i8], align 128 + +; Function Attrs: convergent norecurse +; CHECK: @_ZTS7KernelA(ptr addrspace(1) %0) +define void @_ZTS7KernelA(ptr addrspace(1) %0) local_unnamed_addr #0 !kernel_arg_addr_space !5 { +entry: +; CHECK: getelementptr inbounds i8, ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV + %1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1 + %2 = getelementptr inbounds i8, ptr addrspace(3) %1, i64 4 + %3 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 4) #1 + ret void +} + +; Function Attrs: convergent +declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1 + +attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" } +attributes #1 = { convergent norecurse } + +!llvm.module.flags = !{!0} +!opencl.spir.version = !{!1} +!spirv.Source = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{i32 4, i32 100000} +!3 = !{!"clang version 13.0.0"} +!4 = !{} +; ![[ADDR_SPACE_MD]] = !{i32 1, i32 3} +!5 = !{i32 1} diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..60a97b0eba8e3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ 
b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/Naghasan/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 3744a4e87ad76..3d4aa856fdbac 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -4,4 +4,4 @@ # Date: Tue Nov 19 10:24:08 2024 +0000 # Merge pull request #1584 from zhaomaosu/simplify-device-global # [DeviceSanitizer] Remove device global "__AsanDeviceGlobalCount" -set(UNIFIED_RUNTIME_TAG 0ea47d7c70b9a21a3d90612a0a0e7525034e62f7) +set(UNIFIED_RUNTIME_TAG bc9eef955058db60a26fe1f30cd83939122c7c01) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 5dff0396f07fb..8de3454b4b207 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1979,6 +1979,16 @@ can be used adding nodes to a graph when creating a graph from queue recording. New methods are also defined that enable submitting an executable graph, e.g. directly to a queue without returning an event. +==== sycl_ext_oneapi_work_group_scratch_memory + +The new property defined by +link:../experimental/sycl_ext_oneapi_work_group_scratch_memory.asciidoc[sycl_ext_oneapi_work_group_scratch_memory] +cannot be used in graph nodes. A synchronous exception will be thrown with error +code `invalid` if a user tries to add them to a graph. 
+ +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + == Examples and Usage Guide Detailed code examples and usage guidelines are provided in the diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_scratch_memory.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_scratch_memory.asciidoc new file mode 100644 index 0000000000000..757406b068fa4 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_scratch_memory.asciidoc @@ -0,0 +1,188 @@ += sycl_ext_oneapi_work_group_scratch_memory + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2024 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 9 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. 
+ +The following extensions are required: + +- link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] + +- link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties] + + +== Status + +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* + + +== Overview + +This extension adds a way to allocate device local memory, without explicitly passing a +kernel argument: `get_work_group_scratch_memory`. It provides access to a dynamically sized +buffer without passing it as an argument to the kernel. +Device local memory is memory that is shared by all work-items in a work-group. +The behavior is similar to the usage of unbounded array with the CUDA `+__shared__+` keyword. + + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY` to one of the values defined in the +table below. Applications can test for the existence of this macro to +determine if the implementation supports this feature, or applications can test +the macro's value to determine which of the extension's features the +implementation supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + + +=== `get_work_group_scratch_memory` function + +The `get_work_group_scratch_memory` function provides access +to a dynamically allocated buffer in the device local memory. 
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+  void* get_work_group_scratch_memory();
+}
+----
+
+_Returns_: A pointer to a dynamically allocated buffer
+ in the device local memory.
+
+The size of the allocation is unknown at compile-time,
+and must be communicated to the SYCL implementation via the
+`work_group_scratch_size` property. Every call to
+`get_work_group_scratch_memory` returns the same allocation
+in device local memory.
+
+=== Kernel properties
+
+The `work_group_scratch_size` property must be passed to a kernel to determine
+the run-time size of the device local memory allocation associated with
+all `get_work_group_scratch_memory` calls.
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+
+struct work_group_scratch_size {
+  constexpr work_group_scratch_size(size_t bytes) : value(bytes) {}
+  size_t value;
+}; // work_group_scratch_size
+
+using work_group_scratch_size_key = work_group_scratch_size;
+
+template <> struct is_property_key<work_group_scratch_size_key> : std::true_type {};
+
+} // namespace sycl::ext::oneapi::experimental
+----
+
+|===
+|Property|Description
+
+|`work_group_scratch_size`
+|The `work_group_scratch_size` property describes the amount of dynamic
+device local memory required by the kernel in bytes.
+
+|===
+
+=== Total allocation check
+
+If the total amount of device local memory requested exceeds a device's
+local memory capacity as reported by `info::device::local_mem_size`
+then the implementation must throw a synchronous exception with the
+`errc::memory_allocation` error code from the kernel invocation command
+(e.g. `parallel_for`). This check must take all APIs that allocate device
+local memory into account, whether via the `work_group_scratch_size` property
+or other APIs such as `local_accessor`.
+
+== Example
+
+[source,c++]
+----
+namespace syclex = sycl::ext::oneapi::experimental;
+
+...
+ +q.parallel_for(sycl::nd_range<1>{N, M}, + syclex::properties{syclex::work_group_scratch_size(M * sizeof(int))}, + [=](sycl::nd_item<1> it) { + auto ptr = syclex::get_work_group_scratch_memory(); + auto ptr2 = syclex::get_work_group_scratch_memory(); +}); +---- + + +== Implementation notes + +This non-normative section provides information about one possible +implementation of this extension. It is not part of the specification of the +extension's API. + +For `get_work_group_scratch_memory`, +the implementation may need to generate some additional code to +appropriately initialize the pointer(s) returned by the call. +Alternatively, it may be possible to initialize the pointer to the beginning +of the device's local memory region (if that value is known). Either way, the +implementation must account for the existence of one or more `local_accessor` +objects (which themselves may allocate a dynamic amount of device local +memory). + + +== Issues + diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_static.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_static.asciidoc new file mode 100644 index 0000000000000..709c890ffdd86 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_static.asciidoc @@ -0,0 +1,236 @@ += sycl_ext_oneapi_work_group_static + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2023 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. 
used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 9 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + + +== Status + +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* + +== Overview + +This extension adds a way to allocate device local memory, without passing a +kernel argument: `work_group_static`. +Device local memory is memory that is shared by all work-items in a work-group. +The behavior is similar to the CUDA `+__shared__+` keyword, and the extension +draws some inspiration from the {cpp} `thread_local` keyword. + +`work_group_static` can only be used to declare variables at namespace, block or class scope, +lifting many of the restrictions in the existing +link:../supported/sycl_ext_oneapi_local_memory.asciidoc[sycl_ext_oneapi_local_memory] +extension. Note, however, that `work_group_static` variables currently place +additional limits on the types that can be allocated, owing to differences in +constructor behavior. + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_WORK_GROUP_STATIC` to one of the values defined in the +table below. 
Applications can test for the existence of this macro to +determine if the implementation supports this feature, or applications can test +the macro's value to determine which of the extension's features the +implementation supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + + +=== `work_group_static` class template + +The `work_group_static` class template provides storage of +an object into device local memory. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +class work_group_static final { +public: + + work_group_static() = default; + work_group_static(const work_group_static&) = delete; + work_group_static& operator=(const work_group_static&) = delete; + + operator T&() const noexcept; + + // Available only if: std::is_array_v == false + const work_group_static& operator=(const T& value) const noexcept; + + T* operator&() const noexcept; +}; + +} // namespace sycl::ext::oneapi::experimental +---- + +`T` must be cv-unqualified, trivially constructible and trivially destructible. + +The storage for the object is allocated in device local memory before +calling the user's kernel lambda, and deallocated when all work-items +in the work-group have completed execution of the kernel. + +Objects of type `work_group_static` must only be declared at namespace, block, lambda or class scope. +If the object is declared in class scope, it must be declared as a static data member. + +SYCL implementations conforming to the full feature set treat +`work_group_static` similarly to the `thread_local` keyword, and when +a `work_group_static` object is declared at block scope it behaves +as if the `static` keyword was specified implicitly. SYCL implementations +conforming to the reduced feature set require the `static` keyword to be +specified explicitly. 
+
+[NOTE]
+====
+If a `work_group_static` object is declared at a block scope, the
+device local memory associated with the object will be identical for all
+usages of that function within the kernel. In cases where a function is called
+multiple times, developers must take care to avoid race conditions (e.g., by
+calling `group_barrier` before and after using the memory).
+====
+
+SYCL 2020 requires that all global variables accessed by a device function are
+`const` or `constexpr`. This extension lifts that restriction for
+`work_group_static` variables.
+
+Each instance of `work_group_static` is associated
+with a unique allocation in device local memory.
+
+[source,c++]
+----
+operator T&() const noexcept;
+----
+_Returns_: A reference to the object stored in the device local memory
+associated with this instance of `work_group_static`.
+
+[source,c++]
+----
+const work_group_static& operator=(const T& value) const noexcept;
+----
+_Constraints_: Available only if `std::is_array_v<T>` is false.
+
+_Effects_: Copy `value` into the storage of the `work_group_static` instance.
+
+_Returns_: A reference to this instance of `work_group_static`.
+
+[source,c++]
+----
+T* operator&() const noexcept;
+----
+_Returns_: A pointer to the device local memory associated with this
+instance of `work_group_static`.
+
+==== Interaction with common address space deduction rules
+
+Objects of type `work_group_static` are assigned to
+the local address space.
+
+
+=== Total allocation check
+
+If the total amount of device local memory requested exceeds a device's
+local memory capacity as reported by `info::device::local_mem_size`
+then the implementation must throw a synchronous exception with the
+`errc::memory_allocation` error code from the kernel invocation command
+(e.g. `parallel_for`). This check must take all APIs that allocate device
+local memory into account, whether via the `work_group_scratch_size` property
+or other APIs such as `local_accessor`.
+ +== Example + +[source,c++] +---- +namespace syclex = sycl::ext::oneapi::experimental; + +/* optional: static */ syclex::work_group_static program_scope_scalar; +/* optional: static */ syclex::work_group_static program_scope_array; + +class ClassScope { + static syclex::work_group_static class_scope_scalar; +}; + +syclex::work_group_static ClassScope::class_scope_scalar; + +void foo() { + /* optional: static */ syclex::work_group_static function_scope_scalar; + function_scope_scalar = 1; // assignment via overloaded = operator + function_scope_scalar += 2; // += operator via implicit conversion to int& + class_scope_scalar = 3; + int* ptr = &function_scope_scalar; // conversion to pointer via overloaded & operator +} + +void bar() { + /* optional: static */ syclex::work_group_static function_scope_array; + function_scope_array[0] = 1; // [] operator via implicit conversion to int(&)[64] + int* ptr = function_scope_array; // conversion to pointer via implicit conversion to int(&)[64] +} +---- + + +== Issues + +* We should clean up the wording regarding the scopes at which + `work_group_static` variables may be declared. + The current wording says they may be "allocated at global or function scope". + However, "function scope" is not a {cpp} term. + I assume we meant "block scope" here? + I assume we also meant "namespace scope" instead of "global scope"? + What about class scope or lambda scope? + Are we intentionally omitting those, or is that an oversight? + Are there any scopes where a `work_group_static` variable may not be declared? + If not, we should just say that they may be allocated at any scope. 
+** Extension changed to use namespace, block, class and lambda scopes +** Require `work_group_static` objects to be declared as static data members if used in a class scope +** `Are there any scopes where a `work_group_static` variable may not be declared?` yes, function parameter scope and non-static data members (just like for `thread_storage`) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_static.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_static.asciidoc deleted file mode 100644 index ae8b0e63fe366..0000000000000 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_static.asciidoc +++ /dev/null @@ -1,330 +0,0 @@ -= sycl_ext_oneapi_work_group_static - -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en -:dpcpp: pass:[DPC++] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - - -== Notice - -[%hardbreaks] -Copyright (C) 2023 Intel Corporation. All rights reserved. - -Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks -of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by -permission by Khronos. - - -== Contact - -To report problems with this extension, please open a new issue at: - -https://github.com/intel/llvm/issues - - -== Dependencies - -This extension is written against the SYCL 2020 revision 8 specification. All -references below to the "core SYCL specification" or to section numbers in the -SYCL specification refer to that revision. 
- -The following extensions are required only for dynamic allocations: - -- link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] - -- link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties] - - -== Status - -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* - - -== Overview - -This extension adds a way to allocate device local memory, without passing a -kernel argument. -Device local memory is memory that is shared by all work-items in a work-group. -The behavior is similar to the CUDA `+__shared__+` keyword, and the extension -draws some inspiration from the {cpp} `thread_local` keyword. - -`work_group_static` variables can be allocated at global or function scope, -lifting many of the restrictions in the existing -link:../supported/sycl_ext_oneapi_local_memory.asciidoc[sycl_ext_oneapi_local_memory] -extension. Note, however, that `work_group_static` variables currently place -additional limits on the types that can be allocated, owing to differences in -constructor behavior. - - -== Specification - -=== Feature test macro - -This extension provides a feature-test macro as described in the core SYCL -specification. An implementation supporting this extension must predefine the -macro `SYCL_EXT_ONEAPI_WORK_GROUP_SPECIFIC` to one of the values defined in the -table below. Applications can test for the existence of this macro to -determine if the implementation supports this feature, or applications can test -the macro's value to determine which of the extension's features the -implementation supports. 
- -[%header,cols="1,5"] -|=== -|Value -|Description - -|1 -|The APIs of this experimental extension are not versioned, so the - feature-test macro always has this value. -|=== - - -=== `work_group_static` class template - -The `work_group_static` class template acts as a view of an -implementation-managed pointer to device local memory. - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - -template -class work_group_static { -public: - - work_group_static() = default; - work_group_static(const work_group_static&) = delete; - work_group_static& operator=(const work_group_static&) = delete; - - operator T&() const noexcept; - - // Available only if: std::is_array_v == false - const work_group_static& operator=(const T& value) const noexcept; - - T* operator&() const noexcept; - -private: - T* ptr; // exposition only - -}; - -} // namespace sycl::ext::oneapi::experimental ----- - -`T` must be one of the following: - -* A trivially constructible and trivially destructible type, or -* An unbounded array of type `U`, where `U` is a trivially constructible and - trivially destructible type. - -The storage for the object is allocated in device local memory before -calling the user's kernel lambda, and deallocated when all work-items -in the work-group have completed execution of the kernel. - -SYCL implementations conforming to the full feature set treat -`work_group_static` similarly to the `thread_local` keyword, and when -a `work_group_static` object is declared at block scope it behaves -as if the `static` keyword was specified implicitly. SYCL implementations -conforming to the reduced feature set require the `static` keyword to be -specified explicitly. - -[NOTE] -==== -If a `work_group_static` object is declared at function scope, the -device local memory associated with the object will be identical for all -usages of that function within the kernel. 
In cases where a function is called -multiple times, developers must take care to avoid race conditions (e.g., by -calling `group_barrier` before and after using the memory). -==== - -SYCL 2020 requires that all global variables accessed by a device function are -`const` or `constexpr`. This extension lifts that restriction for -`work_group_static` variables. - -[NOTE] -==== -Since `work_group_static` acts as a view, wrapping an underlying pointer, a -developer may still choose to declare variables as `const`. -==== - -When `T` is a class type or bounded array, the size of the allocation is known -at compile-time, and a SYCL implementation may embed the size of the allocation -directly within a kernel. Each instance of `work_group_static` is associated -with a unique allocation in device local memory. - -When `T` is an unbounded array, the size of the allocation is unknown at -compile-time, and must be communicated to the SYCL implementation via the -`work_group_static_memory_size` property. Every instance of `work_group_static` -for which `T` is an unbounded array is associated with a single, shared, -allocation in device local memory. For example, two instances declared -as `work_group_static` and `work_group_static` will be -associated with the same shared allocation. - -If the total amount of device local memory requested (i.e., the sum of -all memory requested by `local_accessor`, `group_local_memory`, -`group_local_memory_for_overwrite` and `work_group_static`) exceeds a device's -local memory capacity (as reported by `local_mem_size`) then the implementation -must throw a synchronous `exception` with the `errc::memory_allocation` error -code from the kernel invocation command (e.g. `parallel_for`). - -[source,c++] ----- -operator T&() const noexcept; ----- -_Returns_: A reference to the object stored in the device local memory -associated with this instance of `work_group_static`. 
- -[source,c++] ----- -const work_group_static& operator=(const T& value) const noexcept; ----- -_Constraints_: Available only if `std::is_array_v>` is false. - -_Effects_: Replaces the value referenced by `*ptr` with `value`. - -_Returns_: A reference to this instance of `work_group_static`. - -[source,c++] ----- -T* operator&() const noexcept; ----- -_Returns_: A pointer to the device local memory associated with this -instance of `work_group_static` (i.e., `ptr`). - - -==== Kernel properties - -The `work_group_static_size` property must be passed to a kernel to determine -the run-time size of the device local memory allocation associated with -all `work_group_static` variables of unbounded array type. - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - -struct work_group_static_size { - constexpr work_group_static_size(size_t bytes) : value(bytes) {} - size_t value; -}; // work_group_static_size - -using work_group_static_size_key = work_group_static_size; - -template <>struct is_property_key : std::true_type {}; - -} // namespace sycl::ext::oneapi::experimental ----- - -|=== -|Property|Description - -|`work_group_static_size` -|The `work_group_static_size` property describes the amount of dynamic -device local memory required by the kernel in bytes. 
- -|=== - - -==== Usage examples - -===== Allocations with size known at compile-time - -[source,c++] ----- -using namespace syclex = sycl::ext::oneapi::experimental; - -/* optional: static const */ syclex::work_group_static program_scope_scalar; -/* optional: static const */ syclex::work_group_static program_scope_array; - -void foo() { - /* optional: static const */ syclex::work_group_static function_scope_scalar; - function_scope_scalar = 1; // assignment via overloaded = operator - function_scope_scalar += 2; // += operator via implicit conversion to int& - int* ptr = &function_scope_scalar; // conversion to pointer via overloaded & operator -} - -void bar() { - /* optional: static const */ sylex::work_group_static function_scope_array; - function_scope_array[0] = 1; // [] operator via implicit conversion to int(&)[64] - int* ptr = function_scope_array; // conversion to pointer via implicit conversion to int(&)[64] -} ----- - -===== Allocations with size unknown at compile-time - -[source,c++] ----- -using namespace syclex = sycl::ext::oneapi::experimental; - -/* optional: static const */ syclex::work_group_static dynamic_program_scope_array; - -... - -q.parallel_for(sycl::nd_range<1>{N, M}, - syclex::properties{syclex::work_group_static_size(M * sizeof(int))}, - [=](sycl::nd_item<1> it) { - ... -}); ----- - - -== Implementation notes - -This non-normative section provides information about one possible -implementation of this extension. It is not part of the specification of the -extension's API. 
- -For class types and bounded arrays, the class can be implemented on top of -the existing `__sycl_allocateLocalMemory` intrinsic: -[source,c++] ----- -#ifdef __SYCL_DEVICE_ONLY__ - __attribute__((opencl_local)) T *ptr = reinterpret_cast<__attribute__((opencl_local)) T *>(__sycl_allocateLocalMemory(sizeof(T), alignof(T))); -#else - T *ptr{}; -#endif ----- - -Note, however, that implementing the correct semantics may require some -adjustment to the handling of this intrinsic. A simple class as written above -would create a separate allocation for every call to an inlined function. -Creating device local memory allocations should be handled before inlining to -prevent this. - -For unbounded arrays, a separate specialization of the class will be required, -and the implementation may need to generate some additional code to -appropriately initialize the pointer(s) wrapped by `work_group_static` objects. -Alternatively, it may be possible to initialize the pointer to the beginning -of the device's local memory region (if that value is known). Either way, the -implementation must account for the existence of one or more `local_accessor` -objects (which themselves may allocate a dynamic amount of device local -memory). - - -== Issues - -* We should clean up the wording regarding the scopes at which - `work_group_static` variables may be declared. - The current wording says they may be "allocated at global or function scope". - However, "function scope" is not a {cpp} term. - I assume we meant "block scope" here? - I assume we also meant "namespace scope" instead of "global scope"? - What about class scope or lambda scope? - Are we intentionally omitting those, or is that an oversight? - Are there any scopes where a `work_group_static` variable may not be declared? - If not, we should just say that they may be allocated at any scope. 
diff --git a/sycl/include/sycl/detail/sycl_local_mem_builtins.hpp b/sycl/include/sycl/detail/sycl_local_mem_builtins.hpp new file mode 100644 index 0000000000000..c1162f309291f --- /dev/null +++ b/sycl/include/sycl/detail/sycl_local_mem_builtins.hpp @@ -0,0 +1,22 @@ +//==----- sycl_local_mem_builtins.hpp --- SYCL local memory builtins ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include // for __DPCPP_SYCL_EXTERNAL + +#ifdef __SYCL_DEVICE_ONLY__ +// Request a fixed-size allocation in local address space at kernel scope. +// Required for group_local_memory and work_group_static. +extern "C" __DPCPP_SYCL_EXTERNAL __attribute__((opencl_local)) std::uint8_t * +__sycl_allocateLocalMemory(std::size_t Size, std::size_t Alignment); +// Request a placeholder for a dynamically-sized buffer in local address space +// at kernel scope. Required for work_group_static. 
+extern "C" __DPCPP_SYCL_EXTERNAL __attribute__((opencl_local)) std::uint8_t * +__sycl_dynamicLocalMemoryPlaceholder(); +#endif diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 2bc3ef1d921ab..1756615f87265 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -59,7 +59,8 @@ enum class UnsupportedGraphFeatures { sycl_ext_oneapi_device_global = 6, sycl_ext_oneapi_bindless_images = 7, sycl_ext_oneapi_experimental_cuda_cluster_launch = 8, - sycl_ext_codeplay_enqueue_native_command = 9 + sycl_ext_codeplay_enqueue_native_command = 9, + sycl_ext_oneapi_work_group_scratch_memory = 10 }; inline const char * @@ -86,6 +87,8 @@ UnsupportedFeatureToString(UnsupportedGraphFeatures Feature) { return "sycl_ext_oneapi_experimental_cuda_cluster_launch"; case UGF::sycl_ext_codeplay_enqueue_native_command: return "sycl_ext_codeplay_enqueue_native_command"; + case UGF::sycl_ext_oneapi_work_group_scratch_memory: + return "sycl_ext_oneapi_work_group_scratch_memory"; } assert(false && "Unhandled graphs feature"); diff --git a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp index 916e5849054a1..6e65b9acffe8e 100644 --- a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp @@ -9,19 +9,14 @@ #include // for address_space, decorated #include // for __SYCL_ALWAYS_INLINE -#include // for is_group -#include // for exception -#include // for multi_ptr -#include // for workGroupBarrier +#include // for __sycl_allocateLocalMemory +#include // for is_group +#include // for exception +#include // for multi_ptr +#include // for workGroupBarrier #include // for enable_if_t -#ifdef __SYCL_DEVICE_ONLY__ -// Request a fixed-size allocation in local address space at kernel scope. 
-extern "C" __DPCPP_SYCL_EXTERNAL __attribute__((opencl_local)) std::uint8_t * -__sycl_allocateLocalMemory(std::size_t Size, std::size_t Alignment); -#endif - namespace sycl { inline namespace _V1 { namespace ext::oneapi { diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index 2483d06c73b5a..d4c45bc645e11 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -221,8 +221,9 @@ enum PropKind : uint32_t { Prefetch = 76, Deterministic = 77, InitializeToIdentity = 78, + WorkGroupScratchSize = 79, // PropKindSize must always be the last value. - PropKindSize = 79, + PropKindSize = 80, }; template struct PropertyToKind { diff --git a/sycl/include/sycl/ext/oneapi/work_group_scratch_memory.hpp b/sycl/include/sycl/ext/oneapi/work_group_scratch_memory.hpp new file mode 100644 index 0000000000000..bf7cc05a728bb --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/work_group_scratch_memory.hpp @@ -0,0 +1,54 @@ +//==--- work_group_scratch_memory.hpp - SYCL group local memory extension --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include // for __SYCL_ALWAYS_INLINE +#include // for __sycl_allocateLocalMemory +#include // for exception +#include // for properties + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi { +namespace experimental { + +__SYCL_ALWAYS_INLINE +inline void *get_work_group_scratch_memory() { +#ifdef __SYCL_DEVICE_ONLY__ + return __sycl_dynamicLocalMemoryPlaceholder(); +#else + throw sycl::exception( + sycl::errc::feature_not_supported, + "sycl_ext_oneapi_work_group_scratch_memory extension is not supported on host"); +#endif +} + +// Property +struct work_group_scratch_size + : ::sycl::ext::oneapi::experimental::detail::run_time_property_key< + work_group_scratch_size, ::sycl::ext::oneapi::experimental::detail:: + PropKind::WorkGroupScratchSize> { + // Runtime property part + constexpr work_group_scratch_size(size_t bytes) : size(bytes) {} + + size_t size; +}; + +using work_group_scratch_size_key = work_group_scratch_size; + +namespace detail { +template <> struct PropertyMetaInfo<work_group_scratch_size_key> { + static constexpr const char *name = "sycl-work-group-static"; + static constexpr int value = 1; +}; + +} // namespace detail +} // namespace experimental +} // namespace ext::oneapi +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/work_group_static.hpp b/sycl/include/sycl/ext/oneapi/work_group_static.hpp new file mode 100644 index 0000000000000..4425b05e9cff7 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/work_group_static.hpp @@ -0,0 +1,62 @@ +//==----- work_group_static.hpp --- SYCL group local memory extension -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include // for __SYCL_ALWAYS_INLINE +#include // for exception + +#include // for enable_if_t, is_trivially_destructible_v ... + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi { +namespace experimental { + +#ifdef __SYCL_DEVICE_ONLY__ +#define __SYCL_WG_SCOPE [[__sycl_detail__::wg_scope]] +#else +#define __SYCL_WG_SCOPE +#endif + +/// @brief Allocate data in device local memory. +/// Any work_group_static object will be place in device local memory and hold +/// an object of type T. work_group_static object are implicitly treated as +/// static. +/// @tparam T must be a trivially constructible and destructible type +template class __SYCL_WG_SCOPE work_group_static final { +public: + static_assert( + std::is_trivially_destructible_v && + std::is_trivially_constructible_v, + "Can only be used with trivially constructible and destructible types"); + static_assert(!std::is_const_v && !std::is_volatile_v, + "Can only be used with non const and non volatile types"); + __SYCL_ALWAYS_INLINE work_group_static() = default; + work_group_static(const work_group_static &) = delete; + work_group_static &operator=(const work_group_static &) = delete; + + operator T &() noexcept { return data; } + + template >> + work_group_static &operator=(const T &value) noexcept { + data = value; + return *this; + } + + T *operator&() noexcept { return &data; } + +private: + T data; +}; + +#undef __SYCL_WG_SCOPE + +} // namespace experimental +} // namespace ext::oneapi +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 4e8f62d53c36d..99c8e5fd9af15 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,14 @@ class 
__SYCL_EXPORT handler { prop.coordinationScope); } + if constexpr (PropertiesT::template has_property< + sycl::ext::oneapi::experimental:: + work_group_scratch_size>()) { + auto WorkGroupMemSize = Props.template get_property< + sycl::ext::oneapi::experimental::work_group_scratch_size>(); + setKernelWorkGroupMem(WorkGroupMemSize.size); + } + checkAndSetClusterRange(Props); } @@ -3550,6 +3559,9 @@ class __SYCL_EXPORT handler { // Set using cuda thread block cluster launch flag and set the launch bounds. void setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims); + // Set the request work group memory size (work_group_static ext). + void setKernelWorkGroupMem(size_t Size); + // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time // during device compilations (by reducing amount of templates we have to diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index f0dadad99dac5..5f08485159187 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -263,6 +263,7 @@ class CGExecKernel : public CG { ur_kernel_cache_config_t MKernelCacheConfig; bool MKernelIsCooperative = false; bool MKernelUsesClusterLaunch = false; + size_t MKernelWorkGroupMemorySize = 0; CGExecKernel(NDRDescT NDRDesc, std::shared_ptr HKernel, std::shared_ptr SyclKernel, @@ -273,7 +274,7 @@ class CGExecKernel : public CG { std::vector> AuxiliaryResources, CGType Type, ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative, bool MKernelUsesClusterLaunch, - detail::code_location loc = {}) + size_t KernelWorkGroupMemorySize, detail::code_location loc = {}) : CG(Type, std::move(CGData), std::move(loc)), MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), @@ -282,7 +283,8 @@ class CGExecKernel : public CG { MAuxiliaryResources(std::move(AuxiliaryResources)), MAlternativeKernels{}, 
MKernelCacheConfig(std::move(KernelCacheConfig)), MKernelIsCooperative(KernelIsCooperative), - MKernelUsesClusterLaunch(MKernelUsesClusterLaunch) { + MKernelUsesClusterLaunch(MKernelUsesClusterLaunch), + MKernelWorkGroupMemorySize(KernelWorkGroupMemorySize) { assert(getType() == CGType::Kernel && "Wrong type of exec kernel CG."); } diff --git a/sycl/source/detail/compiler.hpp b/sycl/source/detail/compiler.hpp index 35f313ceec3f5..40bf97299138f 100644 --- a/sycl/source/detail/compiler.hpp +++ b/sycl/source/detail/compiler.hpp @@ -66,6 +66,8 @@ #define __SYCL_PROPERTY_SET_SYCL_HOST_PIPES "SYCL/host pipes" /// PropertySetRegistry::SYCL_VIRTUAL_FUNCTIONS defined in PropertySetIO.h #define __SYCL_PROPERTY_SET_SYCL_VIRTUAL_FUNCTIONS "SYCL/virtual functions" +/// PropertySetRegistry::SYCL_IMPLICIT_LOCAL_ARG defined in PropertySetIO.h +#define __SYCL_PROPERTY_SET_SYCL_IMPLICIT_LOCAL_ARG "SYCL/implicit local arg" /// Program metadata tags recognized by the PI backends. For kernels the tag /// must appear after the kernel name. 
diff --git a/sycl/source/detail/device_binary_image.cpp b/sycl/source/detail/device_binary_image.cpp index 2be48d4a38fce..633a4269e1e78 100644 --- a/sycl/source/detail/device_binary_image.cpp +++ b/sycl/source/detail/device_binary_image.cpp @@ -182,6 +182,7 @@ void RTDeviceBinaryImage::init(sycl_device_binary Bin) { DeviceLibReqMask.init(Bin, __SYCL_PROPERTY_SET_DEVICELIB_REQ_MASK); KernelParamOptInfo.init(Bin, __SYCL_PROPERTY_SET_KERNEL_PARAM_OPT_INFO); AssertUsed.init(Bin, __SYCL_PROPERTY_SET_SYCL_ASSERT_USED); + ImplicitLocalArg.init(Bin, __SYCL_PROPERTY_SET_SYCL_IMPLICIT_LOCAL_ARG); ProgramMetadata.init(Bin, __SYCL_PROPERTY_SET_PROGRAM_METADATA); // Convert ProgramMetadata into the UR format for (const auto &Prop : ProgramMetadata) { diff --git a/sycl/source/detail/device_binary_image.hpp b/sycl/source/detail/device_binary_image.hpp index 203427b89ca45..1053ea72668b6 100644 --- a/sycl/source/detail/device_binary_image.hpp +++ b/sycl/source/detail/device_binary_image.hpp @@ -230,6 +230,7 @@ class RTDeviceBinaryImage { } const PropertyRange &getHostPipes() const { return HostPipes; } const PropertyRange &getVirtualFunctions() const { return VirtualFunctions; } + const PropertyRange &getImplicitLocalArg() const { return ImplicitLocalArg; } std::uintptr_t getImageID() const { assert(Bin && "Image ID is not available without a binary image."); @@ -255,6 +256,7 @@ class RTDeviceBinaryImage { RTDeviceBinaryImage::PropertyRange DeviceRequirements; RTDeviceBinaryImage::PropertyRange HostPipes; RTDeviceBinaryImage::PropertyRange VirtualFunctions; + RTDeviceBinaryImage::PropertyRange ImplicitLocalArg; std::vector ProgramMetadataUR; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 861ec2a883601..489f5a46787ff 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1080,7 +1080,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, nullptr, // TODO: Extract from handler 
UR_KERNEL_CACHE_CONFIG_DEFAULT, CG->MKernelIsCooperative, - CG->MKernelUsesClusterLaunch); + CG->MKernelUsesClusterLaunch, CG->MKernelWorkGroupMemorySize); ScheduledEvents.push_back(NewEvent); } else if (!NodeImpl->isEmpty()) { // Empty nodes are node processed as other nodes, but only their diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index e452eca0c8a6d..cf776bcbc2cc3 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -121,6 +121,7 @@ class handler_impl { bool MKernelIsCooperative = false; bool MKernelUsesClusterLaunch = false; + uint32_t MKernelWorkGroupMemorySize = 0; // Extra information for bindless image copy ur_image_desc_t MSrcImageDesc = {}; diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 19f1915943f05..355d5bb40e70a 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -1021,7 +1021,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr), std::move(CGData), std::move(FusedArgs), FusedOrCachedKernelName, {}, {}, CGType::Kernel, KernelCacheConfig, false /* KernelIsCooperative */, - false /* KernelUsesClusterLaunch*/)); + false /* KernelUsesClusterLaunch*/, 0 /* KernelWorkGroupMemorySize */)); return FusedCG; } diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 8f13c0745ad21..4796d997ae58e 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1717,6 +1717,24 @@ bool ProgramManager::kernelUsesAssert(const std::string &KernelName) const { return m_KernelUsesAssert.find(KernelName) != m_KernelUsesAssert.end(); } +void ProgramManager::cacheKernelImplicitLocalArg(RTDeviceBinaryImage &Img) { + const RTDeviceBinaryImage::PropertyRange &ImplicitLocalArgRange = + Img.getImplicitLocalArg(); + 
if (ImplicitLocalArgRange.isAvailable()) + for (auto Prop : ImplicitLocalArgRange) { + m_KernelImplicitLocalArgPos[Prop->Name] = + DeviceBinaryProperty(Prop).asUint32(); + } +} + +std::optional +ProgramManager::kernelImplicitLocalArgPos(const std::string &KernelName) const { + auto it = m_KernelImplicitLocalArgPos.find(KernelName); + if (it != m_KernelImplicitLocalArgPos.end()) + return it->second; + return {}; +} + void ProgramManager::addImages(sycl_device_binaries DeviceBinary) { const bool DumpImages = std::getenv("SYCL_DUMP_IMAGES") && !m_UseSpvFile; for (int I = 0; I < DeviceBinary->NumDeviceBinaries; I++) { @@ -1827,6 +1845,8 @@ void ProgramManager::addImages(sycl_device_binaries DeviceBinary) { Prop && (detail::DeviceBinaryProperty(Prop).asUint32() != 0); } + cacheKernelImplicitLocalArg(*Img); + // Sort kernel ids for faster search std::sort(m_BinImg2KernelIDs[Img.get()]->begin(), m_BinImg2KernelIDs[Img.get()]->end(), LessByHash{}); diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 0586a41a83540..60d43a87e3fa6 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -292,6 +292,9 @@ class ProgramManager { bool kernelUsesAssert(const std::string &KernelName) const; + std::optional + kernelImplicitLocalArgPos(const std::string &KernelName) const; + bool kernelUsesAsan() const { return m_AsanFoundInImage; } std::set @@ -317,6 +320,9 @@ class ProgramManager { /// Add info on kernels using assert into cache void cacheKernelUsesAssertInfo(RTDeviceBinaryImage &Img); + /// Add info on kernels using local arg into cache + void cacheKernelImplicitLocalArg(RTDeviceBinaryImage &Img); + std::set collectDeviceImageDepsForImportedSymbols(const RTDeviceBinaryImage &Img, device Dev); @@ -418,6 +424,7 @@ class ProgramManager { RTDeviceBinaryImageUPtr m_SpvFileImage; std::set m_KernelUsesAssert; + std::unordered_map 
m_KernelImplicitLocalArgPos; // True iff there is a device image compiled with AddressSanitizer bool m_AsanFoundInImage; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 091504a983ff3..d23e94e9f97b8 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2427,7 +2427,8 @@ static ur_result_t SetKernelParamsAndLaunch( const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, - const RTDeviceBinaryImage *BinImage, const std::string &KernelName) { + uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage, + const std::string &KernelName) { assert(Queue && "Kernel submissions should have an associated queue"); const AdapterPtr &Adapter = Queue->getAdapter(); @@ -2447,6 +2448,17 @@ static ur_result_t SetKernelParamsAndLaunch( applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); + std::optional ImplicitLocalArg = + ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName); + // Set the implicit local memory buffer to support + // get_work_group_scratch_memory. This is for backend not supporting + // CUDA-style local memory setting. Note that we may have -1 as a position, + // this indicates the buffer is actually unused and was elided. 
+ if (ImplicitLocalArg.has_value() && ImplicitLocalArg.value() != -1) { + Adapter->call( + Kernel, ImplicitLocalArg.value(), WorkGroupMemorySize, nullptr); + } + adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr())); // Remember this information before the range dimensions are reversed @@ -2474,9 +2486,8 @@ static ur_result_t SetKernelParamsAndLaunch( } if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); + std::vector property_list; if (KernelUsesClusterLaunch) { - std::vector property_list; - ur_exp_launch_property_value_t launch_property_value_cluster_range; launch_property_value_cluster_range.clusterDim[0] = NDRDesc.ClusterDimensions[0]; @@ -2494,13 +2505,20 @@ static ur_result_t SetKernelParamsAndLaunch( property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE, launch_property_value_cooperative}); } - + } + // If there is no implicit arg, let the driver handle it via a property + if (WorkGroupMemorySize && !ImplicitLocalArg.has_value()) { + property_list.push_back( + {UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, WorkGroupMemorySize}); + } + if (!property_list.empty()) { ur_event_handle_t UREvent = nullptr; ur_result_t Error = Adapter->call_nocheck( - Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalSize[0], - LocalSize, property_list.size(), property_list.data(), - RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], + Queue->getHandleRef(), Kernel, NDRDesc.Dims, + &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, + property_list.size(), property_list.data(), RawEvents.size(), + RawEvents.empty() ? nullptr : &RawEvents[0], OutEventImpl ? 
&UREvent : nullptr); if (OutEventImpl) { OutEventImpl->setHandle(UREvent); @@ -2692,7 +2710,8 @@ void enqueueImpKernel( const detail::EventImplPtr &OutEventImpl, const std::function &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, - const bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage) { + const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, + const RTDeviceBinaryImage *BinImage) { assert(Queue && "Kernel submissions should have an associated queue"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); @@ -2784,7 +2803,8 @@ void enqueueImpKernel( Error = SetKernelParamsAndLaunch( Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList, OutEventImpl, EliminatedArgMask, getMemAllocationFunc, - KernelIsCooperative, KernelUsesClusterLaunch, BinImage, KernelName); + KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize, + BinImage, KernelName); const AdapterPtr &Adapter = Queue->getAdapter(); if (!SyclKernelImpl && !MSyclKernel) { @@ -3176,7 +3196,8 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { SyclKernel, KernelName, RawEvents, EventImpl, getMemAllocationFunc, ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative, - ExecKernel->MKernelUsesClusterLaunch, BinImage); + ExecKernel->MKernelUsesClusterLaunch, + ExecKernel->MKernelWorkGroupMemorySize, BinImage); return UR_RESULT_SUCCESS; } diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 1aecf5ed4eabb..a49f52fbbc436 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -627,7 +627,7 @@ void enqueueImpKernel( const detail::EventImplPtr &Event, const std::function &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative, - const bool KernelUsesClusterLaunch, + const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, const RTDeviceBinaryImage 
*BinImage = nullptr); /// The exec CG command enqueues execution of kernel or explicit memory diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index 8f4fb05752efc..06e0541c74c53 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -109,6 +109,8 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_PROFILING_TAG 1 #define SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND 1 #define SYCL_EXT_ONEAPI_GET_KERNEL_INFO 1 +#define SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY 1 +#define SYCL_EXT_ONEAPI_WORK_GROUP_STATIC 1 // In progress yet #define SYCL_EXT_ONEAPI_ATOMIC16 0 diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index a7ac73f9e4c34..476a6e373f2db 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -290,7 +290,8 @@ event handler::finalize() { KernelBundleImpPtr, MKernel, MKernelName.c_str(), RawEvents, NewEvent, nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, BinImage); + impl->MKernelUsesClusterLaunch, + impl->MKernelWorkGroupMemorySize, BinImage); #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { @@ -349,7 +350,8 @@ event handler::finalize() { std::move(impl->MArgs), MKernelName.c_str(), std::move(MStreamStorage), std::move(impl->MAuxiliaryResources), getType(), impl->MKernelCacheConfig, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, MCodeLoc)); + impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, + MCodeLoc)); break; } case detail::CGType::CopyAccToPtr: @@ -1962,6 +1964,12 @@ void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { impl->MNDRDesc.setClusterDimensions(ClusterSize, Dims); } +void handler::setKernelWorkGroupMem(size_t Size) { + throwIfGraphAssociated(); + impl->MKernelWorkGroupMemorySize = Size; +} + void handler::ext_oneapi_graph( ext::oneapi::experimental::command_graph< 
ext::oneapi::experimental::graph_state::executable> diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/copy_dynamic_size.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/copy_dynamic_size.cpp new file mode 100644 index 0000000000000..1f61653efc44e --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/copy_dynamic_size.cpp @@ -0,0 +1,56 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +#include +#include +#include +#include + +constexpr size_t Size = 1024; +using DataType = int; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +void copy_via_smem(DataType *a, DataType *b, sycl::nd_item<1> it) { + // one work-group copies data to shared memory from A + // And then puts in back into B + + DataType *smem_ptr = + reinterpret_cast(sycl_ext::get_work_group_scratch_memory()); + auto threadIdx_x = it.get_local_linear_id(); + + smem_ptr[threadIdx_x] = a[threadIdx_x]; + sycl::group_barrier(it.get_group()); + + b[threadIdx_x] = smem_ptr[threadIdx_x]; +} + +int main() { + sycl::queue queue; + DataType *a = sycl::malloc_device(Size, queue); + DataType *b = sycl::malloc_device(Size, queue); + std::vector a_host(Size, 1.0); + std::vector b_host(Size, -5.0); + + queue.copy(a_host.data(), a, Size).wait_and_throw(); + + queue + .submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<1>({Size}, {Size}), + sycl_ext::properties{sycl_ext::work_group_scratch_size( + Size * sizeof(DataType))}, + [=](sycl::nd_item<1> it) { copy_via_smem(a, b, it); }); + }) + .wait_and_throw(); + + queue.copy(b, b_host.data(), Size).wait_and_throw(); + for (size_t i = 0; i < b_host.size(); i++) { + assert(b_host[i] == a_host[i]); + } + sycl::free(a, queue); + sycl::free(b, queue); +} diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_local_accessor.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_local_accessor.cpp new file mode 100644 index 
0000000000000..a407d56077d5c --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_local_accessor.cpp @@ -0,0 +1,74 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +// Test work_group_dynamic extension with allocation size specified at runtime +// and an additional local accessor. + +#include +#include + +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t RepeatWG = 16; +constexpr size_t ElemPerWG = WgSize * RepeatWG; +constexpr size_t Size = WgSize * WgCount * RepeatWG; + +using namespace sycl; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +int main() { + queue Q; + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + sycl_ext::work_group_scratch_size static_size(WgSize * RepeatWG * + sizeof(int)); + sycl_ext::properties properties{static_size}; + auto LocalAccessor = + sycl::local_accessor(WgSize * RepeatWG * sizeof(int), Cgh); + Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties, + [=](nd_item<1> Item) { + int *Ptr = reinterpret_cast( + sycl_ext::get_work_group_scratch_memory()); + size_t GroupOffset = + Item.get_group_linear_id() * ElemPerWG; + for (size_t I = 0; I < RepeatWG; ++I) { + Ptr[WgSize * I + Item.get_local_linear_id()] = + Item.get_local_linear_id(); + } + Item.barrier(); + + for (size_t I = 0; I < RepeatWG; ++I) { + // Check that the local accessor works. 
+ size_t LocalIdx = Item.get_local_linear_id() ^ 1; + LocalAccessor[WgSize * I + LocalIdx] = + Ptr[WgSize * I + LocalIdx] + 1; + } + Item.barrier(); + + for (size_t I = 0; I < RepeatWG; ++I) { + // Check that the memory is accessible from other + // work-items + size_t BaseIdx = GroupOffset + (I * WgSize); + size_t LocalIdx = Item.get_local_linear_id(); + size_t GlobalIdx = BaseIdx + LocalIdx; + Acc[GlobalIdx] = LocalAccessor[WgSize * I + LocalIdx]; + } + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) { + std::cout << I << ": " << Acc[I] << std::endl; + assert(Acc[I] == I % WgSize + 1); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_ptr_alias.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_ptr_alias.cpp new file mode 100644 index 0000000000000..e56ccc66bc364 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_ptr_alias.cpp @@ -0,0 +1,68 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +// Test work_group_dynamic extension with allocation size specified at runtime +// and multiple calls to the extension inside the kernel. 
+ +#include +#include + +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t RepeatWG = 16; +constexpr size_t ElemPerWG = WgSize * RepeatWG; +constexpr size_t Size = WgSize * WgCount * RepeatWG; + +using namespace sycl; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +int main() { + queue Q; + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + sycl_ext::work_group_scratch_size static_size(WgSize * RepeatWG * + sizeof(int)); + sycl_ext::properties properties{static_size}; + Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties, + [=](nd_item<1> Item) { + int *Ptr = reinterpret_cast( + sycl_ext::get_work_group_scratch_memory()); + size_t GroupOffset = + Item.get_group_linear_id() * ElemPerWG; + for (size_t I = 0; I < RepeatWG; ++I) { + Ptr[WgSize * I + Item.get_local_linear_id()] = + Item.get_local_linear_id(); + } + + Item.barrier(); + // Check that multiple calls return the same pointer. 
+ unsigned int *PtrAlias = + reinterpret_cast( + sycl_ext::get_work_group_scratch_memory()); + + for (size_t I = 0; I < RepeatWG; ++I) { + // Check that the memory is accessible from other + // work-items + size_t BaseIdx = GroupOffset + (I * WgSize); + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = BaseIdx + LocalIdx; + Acc[GlobalIdx] = PtrAlias[WgSize * I + LocalIdx]; + } + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) { + assert(Acc[I] == I % WgSize); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_allocation.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_allocation.cpp new file mode 100644 index 0000000000000..76032271d7b5c --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_allocation.cpp @@ -0,0 +1,62 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +// Test work_group_dynamic extension with allocation size specified at runtime. 
+ +#include +#include + +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t RepeatWG = 16; +constexpr size_t ElemPerWG = WgSize * RepeatWG; +constexpr size_t Size = WgSize * WgCount * RepeatWG; + +using namespace sycl; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +int main() { + queue Q; + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + sycl_ext::work_group_scratch_size static_size(WgSize * RepeatWG * + sizeof(int)); + sycl_ext::properties properties{static_size}; + Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties, + [=](nd_item<1> Item) { + int *Ptr = reinterpret_cast( + sycl_ext::get_work_group_scratch_memory()); + size_t GroupOffset = + Item.get_group_linear_id() * ElemPerWG; + for (size_t I = 0; I < RepeatWG; ++I) { + Ptr[WgSize * I + Item.get_local_linear_id()] = + Item.get_local_linear_id(); + } + + Item.barrier(); + for (size_t I = 0; I < RepeatWG; ++I) { + // Check that the memory is accessible from other + // work-items + size_t BaseIdx = GroupOffset + (I * WgSize); + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = BaseIdx + LocalIdx; + Acc[GlobalIdx] = Ptr[WgSize * I + LocalIdx]; + } + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) { + assert(Acc[I] == I % WgSize); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_unused.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_unused.cpp new file mode 100644 index 0000000000000..e427305c18ed3 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_unused.cpp @@ -0,0 +1,44 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +#include +#include +#include + +constexpr size_t Size = 1024; +using DataType = int; + +namespace sycl_ext = 
sycl::ext::oneapi::experimental; + +int main() { + sycl::queue queue; + DataType *a = sycl::malloc_device(Size, queue); + DataType *b = sycl::malloc_device(Size, queue); + std::vector a_host(Size, 1.0); + std::vector b_host(Size, -5.0); + + queue.copy(a_host.data(), a, Size).wait_and_throw(); + + queue + .submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<1>({Size}, {Size}), + sycl_ext::properties{sycl_ext::work_group_scratch_size( + Size * sizeof(DataType))}, + [=](sycl::nd_item<1> it) { + b[it.get_local_linear_id()] = + a[it.get_local_linear_id()]; + }); + }) + .wait_and_throw(); + + queue.copy(b, b_host.data(), Size).wait_and_throw(); + for (size_t i = 0; i < b_host.size(); i++) { + assert(b_host[i] == a_host[i]); + } + sycl::free(a, queue); + sycl::free(b, queue); +} diff --git a/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_block_scope.cpp b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_block_scope.cpp new file mode 100644 index 0000000000000..ea1083c92be52 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_block_scope.cpp @@ -0,0 +1,89 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +#include + +#include + +#include +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t Size = WgSize * WgCount; + +struct Foo { + Foo() = delete; + Foo(int Value, int &Counter) { + for (int I = 0; I < WgSize; ++I) + Values[I] = Value; + ++Counter; + } + int Values[WgSize]; +}; + +struct Bar { + int Value = 42; +}; + +class KernelA; +class KernelB; +class KernelC; + +using namespace sycl; + +int main() { + queue Q; + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + sycl::ext::oneapi::experimental::work_group_static + localIDBuff; + localIDBuff[Item.get_local_linear_id()] 
= + Item.get_local_linear_id(); + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + Acc[GlobalIdx] = localIDBuff[LocalIdx]; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == I % WgSize); + } + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + sycl::ext::oneapi::experimental::work_group_static localIDBuff; + int id = Item.get_global_linear_id(); + if (Item.get_group().leader()) + localIDBuff = id; + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t GlobalIdx = Item.get_global_linear_id(); + Acc[GlobalIdx] = localIDBuff; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == (I / WgSize) * WgSize); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_class_scope.cpp b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_class_scope.cpp new file mode 100644 index 0000000000000..cfcde25996679 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_class_scope.cpp @@ -0,0 +1,95 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +#include + +#include + +#include +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t Size = WgSize * WgCount; + +struct Foo { + Foo() = delete; + Foo(int Value, int &Counter) { + for (int I = 0; I < WgSize; ++I) + Values[I] = Value; + ++Counter; + } + int Values[WgSize]; +}; + +struct Bar { + int Value = 42; +}; + +class KernelA; +class KernelB; +class KernelC; + +using namespace sycl; + +struct LocalMem { + // Local mem used in kernel + static 
sycl::ext::oneapi::experimental::work_group_static + localIDBuff; +}; +sycl::ext::oneapi::experimental::work_group_static + LocalMem::localIDBuff; + +int main() { + queue Q; + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + LocalMem::localIDBuff[Item.get_local_linear_id()] = + Item.get_local_linear_id(); + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + Acc[GlobalIdx] = LocalMem::localIDBuff[LocalIdx]; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == I % WgSize); + } + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + sycl::ext::oneapi::experimental::work_group_static localIDBuff; + int id = Item.get_global_linear_id(); + if (Item.get_group().leader()) + localIDBuff = id; + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t GlobalIdx = Item.get_global_linear_id(); + Acc[GlobalIdx] = localIDBuff; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == (I / WgSize) * WgSize); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_namespace_scope.cpp b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_namespace_scope.cpp new file mode 100644 index 0000000000000..15996f8753544 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_namespace_scope.cpp @@ -0,0 +1,90 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +#include + +#include + +#include 
+#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t Size = WgSize * WgCount; + +struct Foo { + Foo() = delete; + Foo(int Value, int &Counter) { + for (int I = 0; I < WgSize; ++I) + Values[I] = Value; + ++Counter; + } + int Values[WgSize]; +}; + +struct Bar { + int Value = 42; +}; + +class KernelA; +class KernelB; +class KernelC; + +using namespace sycl; + +// Local mem used in kernel +sycl::ext::oneapi::experimental::work_group_static localIDBuff; + +int main() { + queue Q; + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + localIDBuff[Item.get_local_linear_id()] = + Item.get_local_linear_id(); + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + Acc[GlobalIdx] = localIDBuff[LocalIdx]; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == I % WgSize); + } + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + sycl::ext::oneapi::experimental::work_group_static localIDBuff; + int id = Item.get_global_linear_id(); + if (Item.get_group().leader()) + localIDBuff = id; + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t GlobalIdx = Item.get_global_linear_id(); + Acc[GlobalIdx] = localIDBuff; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == (I / WgSize) * WgSize); + } +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 
a5134a7a524ca..2028de654cfea 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3524,6 +3524,7 @@ _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE _ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE _ZN4sycl3_V17handler22setKernelClusterLaunchENS0_5rangeILi3EEEi _ZN4sycl3_V17handler22setKernelIsCooperativeEb +_ZN4sycl3_V17handler21setKernelWorkGroupMemEm _ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_ _ZN4sycl3_V17handler24ext_intel_read_host_pipeENS0_6detail11string_viewEPvmb _ZN4sycl3_V17handler24ext_oneapi_memcpy2d_implEPvmPKvmmm diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index a6e6a5e47c137..ff2e3706e25dd 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4241,6 +4241,7 @@ ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4StableKernelCacheConfig@123@@Z ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@H@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z +?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z ?setNDRangeDescriptorPadded@handler@_V1@sycl@@AEAAXV?$range@$02@23@0V?$id@$02@23@H@Z ?setNDRangeDescriptorPadded@handler@_V1@sycl@@AEAAXV?$range@$02@23@V?$id@$02@23@H@Z diff --git a/sycl/test/extensions/work_group_static/work_group_static_negative.cpp b/sycl/test/extensions/work_group_static/work_group_static_negative.cpp new file mode 100644 index 0000000000000..c5ddbe4905439 --- /dev/null +++ b/sycl/test/extensions/work_group_static/work_group_static_negative.cpp @@ -0,0 +1,19 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=warning,note %s + +#include + +#include + +using namespace sycl::ext::oneapi::experimental; + +class InvalidCtorDtor { + InvalidCtorDtor() {} + ~InvalidCtorDtor() {} 
+}; + +SYCL_EXTERNAL void test(int *p) { + // expected-error-re@sycl/ext/oneapi/work_group_static.hpp:* {{static assertion failed due to requirement {{.+}}: Can only be used with non const and non volatile types}} + sycl::ext::oneapi::experimental::work_group_static b1; + // expected-error-re@sycl/ext/oneapi/work_group_static.hpp:* {{static assertion failed due to requirement {{.+}}: Can only be used with trivially constructible and destructible types}} + sycl::ext::oneapi::experimental::work_group_static b2; +} diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 60a935d9ae465..ff5e28772f0fb 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -131,6 +131,8 @@ // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp +// CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp +// CHECK-NEXT: detail/sycl_local_mem_builtins.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: kernel_bundle.hpp // CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 90d95975a0245..452738b8fca86 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -393,6 +393,27 @@ TEST_F(CommandGraphTest, EnqueueCustomCommandCheck) { ASSERT_EQ(ExceptionCode, sycl::errc::invalid); } +// sycl_ext_oneapi_work_group_scratch_memory isn't supported with SYCL graphs +TEST_F(CommandGraphTest, WorkGroupScratchMemoryCheck) { + ASSERT_THROW( + { + try { + Graph.add([&](handler &CGH) { + CGH.parallel_for( + range<1>{1}, + ext::oneapi::experimental::properties{ + ext::oneapi::experimental::work_group_scratch_size( + sizeof(int))}, + [=](item<1> idx) {}); + }); + } 
catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); +} + TEST_F(CommandGraphTest, MakeEdgeErrors) { // Set up some nodes in the graph auto NodeA = Graph.add( diff --git a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp index 0d920590c6e15..02c007c7d27e7 100644 --- a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp +++ b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp @@ -140,7 +140,7 @@ class MockHandler : public sycl::handler { CGH->MKernelName.c_str(), std::move(CGH->MStreamStorage), std::move(impl->MAuxiliaryResources), impl->MCGType, {}, impl->MKernelIsCooperative, impl->MKernelUsesClusterLaunch, - CGH->MCodeLoc)); + impl->MKernelWorkGroupMemorySize, CGH->MCodeLoc)); break; } default: diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 424c9a7dbb58c..ffc0567ba7daa 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -302,7 +302,8 @@ class MockHandlerCustomFinalize : public MockHandler { std::move(impl->MKernelBundle), std::move(CGData), getArgs(), getKernelName(), getStreamStorage(), impl->MAuxiliaryResources, getType(), {}, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, getCodeLoc())); + impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, + getCodeLoc())); break; } case sycl::detail::CGType::CodeplayHostTask: { diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 852d59a43e123..132baecad5448 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -37,7 +37,7 @@ class MockHandlerStreamInit : public MockHandler { getArgs(), getKernelName(), getStreamStorage(), 
std::move(impl->MAuxiliaryResources), getType(), {}, impl->MKernelIsCooperative, impl->MKernelUsesClusterLaunch, - getCodeLoc())); + impl->MKernelWorkGroupMemorySize, getCodeLoc())); break; } default: