diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 6d74a7b2847f7..36f2093a44b7f 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1667,12 +1667,12 @@ def SYCLIntelESimdVectorize : InheritableAttr { } def SYCLScope : Attr { - // No spelling, as this attribute can't be created in the source code. - let Spellings = []; + let Spellings = [CXX11<"__sycl_detail__", "wg_scope">]; let Args = [EnumArgument<"level", "Level", /*is_string=*/false, ["work_group", "work_item"], - ["WorkGroup", "WorkItem"]>]; - let Subjects = SubjectList<[Function, Var]>; + ["WorkGroup", "WorkItem"], + /*optional=*/true>]; + let Subjects = SubjectList<[Function, Var, CXXRecord]>; let LangOpts = [SYCLIsDevice]; let AdditionalMembers = [{ @@ -1685,7 +1685,7 @@ def SYCLScope : Attr { } }]; - let Documentation = [InternalOnly]; + let Documentation = [SYCLWGScopeDocs]; } def SYCLDeviceIndirectlyCallable : InheritableAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 214329b7a4ded..7ad86502be22e 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -4097,6 +4097,18 @@ function pointer for the specified function. }]; } +def SYCLWGScopeDocs : Documentation { + let Category = DocCatFunction; + let Heading = "__sycl_detail__::wg_scope"; + let Content = [{ +This attribute can only be applied to records with a trivial default constructor and destructor. +Types with this attribute cannot be used for non-static data members. +It indicates that any block and namespace scope variable of a type holding this attribute +will be allocated in local memory. For variables allocated in block scope, they behave +as implicitly declared as static. 
+ }]; +} + def SYCLDeviceDocs : Documentation { let Category = DocCatFunction; let Heading = "sycl_device"; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 55f0917a99ac4..817744cc23053 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12547,6 +12547,11 @@ def err_sycl_external_global : Error< def warn_sycl_kernel_too_big_args : Warning< "size of kernel arguments (%0 bytes) may exceed the supported maximum " "of %1 bytes on some devices">, InGroup, ShowInSystemHeader; +def err_sycl_wg_scope : Error< + "SYCL work group scope only applies to class with a trivial " + "%select{default constructor|destructor}0">; +def err_sycl_field_with_wg_scope : Error< + "non-static data member is of a type with a SYCL work group scope attribute applied to it">; def err_sycl_virtual_types : Error< "no class with a vtable can be used in a SYCL kernel or any code included in the kernel">; def note_sycl_recursive_function_declared_here: Note<"function implemented using recursion declared here">; diff --git a/clang/include/clang/Sema/SemaSYCL.h b/clang/include/clang/Sema/SemaSYCL.h index a79d9f55d7736..1efa9e634e317 100644 --- a/clang/include/clang/Sema/SemaSYCL.h +++ b/clang/include/clang/Sema/SemaSYCL.h @@ -267,6 +267,7 @@ class SemaSYCL : public SemaBase { void CheckSYCLKernelCall(FunctionDecl *CallerFunc, ArrayRef Args); + void CheckSYCLScopeAttr(CXXRecordDecl *Decl); /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current /// context is "used as device code". 
@@ -478,6 +479,7 @@ class SemaSYCL : public SemaBase { const ParsedAttr &AL); void handleSYCLIntelMaxWorkGroupsPerMultiprocessor(Decl *D, const ParsedAttr &AL); + void handleSYCLScopeAttr(Decl *D, const ParsedAttr &AL); void checkSYCLAddIRAttributesFunctionAttrConflicts(Decl *D); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 8afe658c3e553..3e5f5005e734a 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -217,7 +217,11 @@ void CodeGenFunction::EmitVarDecl(const VarDecl &D) { if (D.getType().getAddressSpace() == LangAS::opencl_local) return CGM.getOpenCLRuntime().EmitWorkGroupLocalVarDecl(*this, D); - if (D.getAttr() && D.getAttr()->isWorkGroup()) + SYCLScopeAttr *ScopeAttr = D.getAttr(); + if (!ScopeAttr) + if (auto *RD = D.getType()->getAsCXXRecordDecl()) + ScopeAttr = RD->getAttr(); + if (ScopeAttr && ScopeAttr->isWorkGroup()) return CGM.getSYCLRuntime().emitWorkGroupLocalVarDecl(*this, D); assert(D.hasLocalStorage()); diff --git a/clang/lib/CodeGen/CGSYCLRuntime.cpp b/clang/lib/CodeGen/CGSYCLRuntime.cpp index bdad6867182f9..6b1abe409c1d1 100644 --- a/clang/lib/CodeGen/CGSYCLRuntime.cpp +++ b/clang/lib/CodeGen/CGSYCLRuntime.cpp @@ -96,7 +96,10 @@ void CGSYCLRuntime::emitWorkGroupLocalVarDecl(CodeGenFunction &CGF, const VarDecl &D) { #ifndef NDEBUG SYCLScopeAttr *Scope = D.getAttr(); - assert(Scope && Scope->isWorkGroup() && "work group scope expected"); + if (!Scope) + if (auto *RD = D.getType()->getAsCXXRecordDecl()) + Scope = RD->getAttr(); + assert((Scope && Scope->isWorkGroup()) && "work group scope expected"); #endif // NDEBUG // generate global variable in the address space selected by the clang CodeGen // (should be local) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index bbac92cc43b1e..1a662e0164dd5 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -5789,6 +5789,9 @@ LangAS 
CodeGenModule::GetGlobalVarAddressSpace(const VarDecl *D) { if (LangOpts.SYCLIsDevice && D) { auto *Scope = D->getAttr(); + if (!Scope) + if (auto *RD = D->getType()->getAsCXXRecordDecl()) + Scope = RD->getAttr(); if (Scope && Scope->isWorkGroup()) return LangAS::sycl_local; } diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 89029269d93c6..dae8000c91ff3 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1849,7 +1849,9 @@ class DeferredDiagnosticsEmitter if (!S.SYCL().checkAllowedSYCLInitializer(VD) && !S.SYCL() .isTypeDecoratedWithDeclAttribute< - SYCLGlobalVariableAllowedAttr>(VD->getType())) { + SYCLGlobalVariableAllowedAttr>(VD->getType()) && + !S.SYCL().isTypeDecoratedWithDeclAttribute( + VD->getType())) { S.Diag(Loc, diag::err_sycl_restrict) << SemaSYCL::KernelConstStaticVariable; return; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 879428ce11b86..1572e76abcecf 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7870,6 +7870,8 @@ NamedDecl *Sema::ActOnVariableDeclarator( // attribute. if (SCSpec == DeclSpec::SCS_static && !R.isConstant(Context) && !SYCL().isTypeDecoratedWithDeclAttribute( + NewVD->getType()) && + !SYCL().isTypeDecoratedWithDeclAttribute( NewVD->getType())) SYCL().DiagIfDeviceCode(D.getIdentifierLoc(), diag::err_sycl_restrict) << SemaSYCL::KernelNonConstStaticDataVariable; @@ -18551,6 +18553,14 @@ FieldDecl *Sema::CheckFieldDecl(DeclarationName Name, QualType T, InvalidDecl = true; } + if (LangOpts.SYCLIsDevice) { + const CXXRecordDecl *RD = T->getAsCXXRecordDecl(); + if (RD && RD->hasAttr()) { + Diag(Loc, diag::err_sycl_field_with_wg_scope); + InvalidDecl = true; + } + } + if (LangOpts.OpenCL) { // OpenCL v1.2 s6.9b,r & OpenCL v2.0 s6.12.5 - The following types cannot be // used as structure or union field: image, sampler, event or block types. 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 5da515598910c..02a0f306beb80 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6705,6 +6705,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_SYCLDevice: S.SYCL().handleSYCLDeviceAttr(D, AL); break; + case ParsedAttr::AT_SYCLScope: + S.SYCL().handleSYCLScopeAttr(D, AL); + break; case ParsedAttr::AT_SYCLDeviceIndirectlyCallable: S.SYCL().handleSYCLDeviceIndirectlyCallableAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index ceea4ac99d236..8994c87baf2fd 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -7218,6 +7218,9 @@ void Sema::CheckCompletedCXXClass(Scope *S, CXXRecordDecl *Record) { else if (Record->hasAttr()) checkCUDADeviceBuiltinTextureClassTemplate(*this, Record); } + if (getLangOpts().SYCLIsDevice && Record->hasAttr()) { + SYCL().CheckSYCLScopeAttr(Record); + } } /// Look up the special member function that would be called by a special diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 849e14db9c217..6e31c15331d59 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -235,7 +235,9 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, VD->getStorageClass() == SC_Static && !VD->hasAttr() && !SemaSYCL::isTypeDecoratedWithDeclAttribute< - SYCLGlobalVariableAllowedAttr>(VD->getType())) + SYCLGlobalVariableAllowedAttr>(VD->getType()) && + !SemaSYCL::isTypeDecoratedWithDeclAttribute( + VD->getType())) SYCL().DiagIfDeviceCode(*Locs.begin(), diag::err_sycl_restrict) << SemaSYCL::KernelNonConstStaticDataVariable; // Non-const globals are not allowed in SYCL except for ESIMD or with the @@ -243,7 +245,9 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, else if (IsRuntimeEvaluated && !IsEsimdPrivateGlobal && !IsConst && VD->hasGlobalStorage() && 
!VD->hasAttr() && !SemaSYCL::isTypeDecoratedWithDeclAttribute< - SYCLGlobalVariableAllowedAttr>(VD->getType())) + SYCLGlobalVariableAllowedAttr>(VD->getType()) && + !SemaSYCL::isTypeDecoratedWithDeclAttribute( + VD->getType())) SYCL().DiagIfDeviceCode(*Locs.begin(), diag::err_sycl_restrict) << SemaSYCL::KernelGlobalVariable; // ESIMD globals cannot be used in a SYCL context. diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 767dde6512b83..1db4ce7e807f9 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -5104,6 +5104,28 @@ void SemaSYCL::CheckSYCLKernelCall(FunctionDecl *KernelFunc, KernelFunc->setInvalidDecl(); } +void SemaSYCL::CheckSYCLScopeAttr(CXXRecordDecl *Decl) { + assert(Decl->hasAttr()); + + bool HasError = false; + + if (Decl->isDependentContext()) + return; + + // We don't emit both diags at the time as note will only be emitted for the + // first, which is confusing. So we check both cases but only report one. + if (!Decl->hasTrivialDefaultConstructor()) { + Diag(Decl->getLocation(), diag::err_sycl_wg_scope) << 0; + HasError = true; + } else if (!Decl->hasTrivialDestructor()) { + Diag(Decl->getLocation(), diag::err_sycl_wg_scope) << 1; + HasError = true; + } + + if (HasError) + Decl->dropAttr(); +} + // For a wrapped parallel_for, copy attributes from original // kernel to wrapped kernel. 
void SemaSYCL::copySYCLKernelAttrs(CXXMethodDecl *CallOperator) { diff --git a/clang/lib/Sema/SemaSYCLDeclAttr.cpp b/clang/lib/Sema/SemaSYCLDeclAttr.cpp index db0a2fdce4aad..c2268d9f35b14 100644 --- a/clang/lib/Sema/SemaSYCLDeclAttr.cpp +++ b/clang/lib/Sema/SemaSYCLDeclAttr.cpp @@ -3109,6 +3109,21 @@ void SemaSYCL::handleSYCLRegisterNumAttr(Decl *D, const ParsedAttr &AL) { D->addAttr(::new (Context) SYCLRegisterNumAttr(Context, AL, RegNo)); } +void SemaSYCL::handleSYCLScopeAttr(Decl *D, const ParsedAttr &AL) { + if (!AL.checkExactlyNumArgs(SemaRef, 0)) + return; + if (auto *CRD = dyn_cast(D); + !CRD || !(CRD->isClass() || CRD->isStruct())) { + SemaRef.Diag(AL.getRange().getBegin(), + diag::err_attribute_wrong_decl_type_str) + << AL << AL.isRegularKeywordAttribute() << "classes"; + return; + } + + D->addAttr(SYCLScopeAttr::Create(SemaRef.getASTContext(), + SYCLScopeAttr::Level::WorkGroup, AL)); +} + void SemaSYCL::checkSYCLAddIRAttributesFunctionAttrConflicts(Decl *D) { const auto *AddIRFuncAttr = D->getAttr(); diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index caf210e5c1dab..b0a99c82110e9 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -203,6 +203,7 @@ // CHECK-NEXT: SYCLIntelSchedulerTargetFmaxMhz (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelUseStallEnableClusters (SubjectMatchRule_function) // CHECK-NEXT: SYCLRegisterNum (SubjectMatchRule_variable_is_global) +// CHECK-NEXT: SYCLScope (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record) // CHECK-NEXT: SYCLSimd (SubjectMatchRule_function, SubjectMatchRule_variable_is_global) // CHECK-NEXT: SYCLSpecialClass (SubjectMatchRule_record) // CHECK-NEXT: SYCLType (SubjectMatchRule_record, SubjectMatchRule_enum) diff --git a/clang/test/SemaSYCL/sycl_wg_scope.cpp 
b/clang/test/SemaSYCL/sycl_wg_scope.cpp new file mode 100644 index 0000000000000..51f9616d47528 --- /dev/null +++ b/clang/test/SemaSYCL/sycl_wg_scope.cpp @@ -0,0 +1,94 @@ +// Verify the use of wg_scope is correctly diagnosed. +// RUN: %clang_cc1 -fsycl-is-device -verify %s + +class [[__sycl_detail__::wg_scope]] G1 {}; +class [[__sycl_detail__::wg_scope]] G2 { + G2() = default; + G2(int i) : i(i) {} + int i; +}; + +class [[__sycl_detail__::wg_scope]] G3 { + ~G3() = default; +}; + +class [[__sycl_detail__::wg_scope]] B4 { // expected-error {{SYCL work group scope only applies to class with a trivial default constructor}} + B4() {} +}; + +class [[__sycl_detail__::wg_scope]] B5 { // expected-error {{SYCL work group scope only applies to class with a trivial destructor}} + ~B5() {} +}; + +class [[__sycl_detail__::wg_scope]] B6 { // expected-error {{SYCL work group scope only applies to class with a trivial default constructor}} + B6() {} + ~B6() {} +}; + +template class [[__sycl_detail__::wg_scope]] B7 { // #B7 +public: + T obj; +}; + +struct Valid {}; +struct InvalidCtor { + InvalidCtor() {} +}; +struct InvalidDtor { + ~InvalidDtor() {} +}; +struct InvalidCDtor { + InvalidCDtor() {} + ~InvalidCDtor() {} +}; + +B7 b7; +// expected-error@#B7 {{SYCL work group scope only applies to class with a trivial default constructor}} +// expected-note@+1 {{in instantiation of template class 'B7' requested here}} +B7 b9; +// expected-error@#B7 {{SYCL work group scope only applies to class with a trivial destructor}} +// expected-note@+1 {{in instantiation of template class 'B7' requested here}} +B7 b10; +// expected-error@#B7 {{SYCL work group scope only applies to class with a trivial default constructor}} +// expected-note@+1 {{in instantiation of template class 'B7' requested here}} +B7 b11; + +template class [[__sycl_detail__::wg_scope]] B12 { // #B12 +public: + B12() = default; + ~B12() = default; + T obj; +}; + +B12 b12; +// expected-error@#B12 {{SYCL work group scope only 
applies to class with a trivial default constructor}} +// expected-note@+1 {{in instantiation of template class 'B12' requested here}} +B12 b13; + +class B14 { + G1 field; // expected-error {{non-static data member is of a type with a SYCL work group scope attribute applied to it}} +}; + +template class B15 { + T field; // #B15-field +}; + +// expected-error@#B15-field {{non-static data member is of a type with a SYCL work group scope attribute applied to it}} +// expected-note@+1 {{in instantiation of template class 'B15' requested here}} +B15 b15; + +G1 g16; +static G1 g17; + +struct Wrap { + static G1 g18; +}; + +__attribute__((sycl_device)) void ref_func() { + G1 g19; + static G1 g20; + + (void)g16; + (void)g17; + (void)Wrap::g18; +} diff --git a/llvm/include/llvm/SYCLLowerIR/LowerWGLocalMemory.h b/llvm/include/llvm/SYCLLowerIR/LowerWGLocalMemory.h index cdb8e6dff0218..62befadd417cf 100644 --- a/llvm/include/llvm/SYCLLowerIR/LowerWGLocalMemory.h +++ b/llvm/include/llvm/SYCLLowerIR/LowerWGLocalMemory.h @@ -47,6 +47,11 @@ class SYCLLowerWGLocalMemoryPass ModulePass *createSYCLLowerWGLocalMemoryLegacyPass(); void initializeSYCLLowerWGLocalMemoryLegacyPass(PassRegistry &); +namespace sycl { +std::vector> +getKernelNamesUsingImplicitLocalMem(const Module &M); +} + } // namespace llvm #endif // LLVM_SYCLLOWERIR_LOWERWGLOCALMEMORY_H diff --git a/llvm/include/llvm/Support/PropertySetIO.h b/llvm/include/llvm/Support/PropertySetIO.h index 13cb687f3b08b..201a93d19e7bc 100644 --- a/llvm/include/llvm/Support/PropertySetIO.h +++ b/llvm/include/llvm/Support/PropertySetIO.h @@ -210,6 +210,7 @@ class PropertySetRegistry { static constexpr char SYCL_DEVICE_REQUIREMENTS[] = "SYCL/device requirements"; static constexpr char SYCL_HOST_PIPES[] = "SYCL/host pipes"; static constexpr char SYCL_VIRTUAL_FUNCTIONS[] = "SYCL/virtual functions"; + static constexpr char SYCL_IMPLICIT_LOCAL_ARG[] = "SYCL/implicit local arg"; /// Function for bulk addition of an entire property set in the 
given /// \p Category . diff --git a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp index cc287b9101fa8..fd79ac3cd9a31 100644 --- a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp +++ b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp @@ -16,6 +16,7 @@ #include "llvm/SYCLLowerIR/CompileTimePropertiesPass.h" #include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/SYCLLowerIR/HostPipes.h" +#include "llvm/SYCLLowerIR/LowerWGLocalMemory.h" #include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/SYCLLowerIR/SYCLDeviceLibReqMask.h" #include "llvm/SYCLLowerIR/SYCLKernelParamOptInfo.h" @@ -397,6 +398,13 @@ PropSetRegTy computeModuleProperties(const Module &M, for (const StringRef &FName : FuncNames) PropSet.add(PropSetRegTy::SYCL_ASSERT_USED, FName, true); } + { + std::vector> ArgPos = + getKernelNamesUsingImplicitLocalMem(M); + for (const auto &FuncAndArgPos : ArgPos) + PropSet.add(PropSetRegTy::SYCL_IMPLICIT_LOCAL_ARG, FuncAndArgPos.first, + FuncAndArgPos.second); + } { if (isModuleUsingAsan(M)) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index 1ca82ae078df0..bb17dc3bc69cd 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -13,13 +13,20 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/Pass.h" +#include "llvm/TargetParser/Triple.h" using namespace llvm; #define DEBUG_TYPE "LowerWGLocalMemory" static constexpr char SYCL_ALLOCLOCALMEM_CALL[] = "__sycl_allocateLocalMemory"; +static constexpr char SYCL_DYNAMIC_LOCALMEM_CALL[] = + "__sycl_dynamicLocalMemoryPlaceholder"; static constexpr char LOCALMEMORY_GV_PREF[] = "WGLocalMem"; +static constexpr char DYNAMIC_LOCALMEM_GV[] = + "__sycl_dynamicLocalMemoryPlaceholder_GV"; +static constexpr char WORK_GROUP_STATIC_ATTR[] = "sycl-work-group-static"; +static constexpr char WORK_GROUP_STATIC_ARG_ATTR[] = 
"sycl-implicit-local-arg"; namespace { class SYCLLowerWGLocalMemoryLegacy : public ModulePass { @@ -42,6 +49,31 @@ class SYCLLowerWGLocalMemoryLegacy : public ModulePass { }; } // namespace +std::vector> +sycl::getKernelNamesUsingImplicitLocalMem(const Module &M) { + std::vector> SPIRKernelNames; + Triple TT(M.getTargetTriple()); + + if (TT.isSPIROrSPIRV()) { + auto GetArgumentPos = [&](const Function &F) -> int { + for (const Argument &Arg : F.args()) + if (F.getAttributes().hasParamAttr(Arg.getArgNo(), + WORK_GROUP_STATIC_ARG_ATTR)) + return Arg.getArgNo(); + // Not lowered to an implicit arg or DAE. + return -1; + }; + llvm::for_each(M.functions(), [&](const Function &F) { + if (F.getCallingConv() == CallingConv::SPIR_KERNEL && + F.hasFnAttribute(WORK_GROUP_STATIC_ATTR)) { + int ArgPos = GetArgumentPos(F); + SPIRKernelNames.emplace_back(F.getName(), ArgPos); + } + }); + } + return SPIRKernelNames; +} + char SYCLLowerWGLocalMemoryLegacy::ID = 0; INITIALIZE_PASS(SYCLLowerWGLocalMemoryLegacy, "sycllowerwglocalmemory", "Replace __sycl_allocateLocalMemory with allocation of memory " @@ -90,17 +122,27 @@ static void lowerAllocaLocalMemCall(CallInst *CI, Module &M) { CI->replaceAllUsesWith(GVPtr); } -static bool allocaWGLocalMemory(Module &M) { - Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); - if (!ALMFunc) - return false; +static void +lowerDynamicLocalMemCallDirect(CallInst *CI, Triple TT, + GlobalVariable *LocalMemPlaceholder) { + assert(CI); - assert(ALMFunc->isDeclaration() && "should have declaration only"); + Value *GVPtr = [&]() -> Value * { + IRBuilder<> Builder(CI); + if (TT.isSPIROrSPIRV()) + return Builder.CreateLoad(CI->getType(), LocalMemPlaceholder); + return Builder.CreatePointerCast(LocalMemPlaceholder, CI->getType()); + }(); + CI->replaceAllUsesWith(GVPtr); +} + +static void lowerLocalMemCall(Function *LocalMemAllocFunc, + std::function TransformCall) { SmallVector DelCalls; - for (User *U : ALMFunc->users()) { + for (User *U : 
LocalMemAllocFunc->users()) { auto *CI = cast(U); - lowerAllocaLocalMemCall(CI, M); + TransformCall(CI); DelCalls.push_back(CI); } @@ -110,15 +152,173 @@ static bool allocaWGLocalMemory(Module &M) { } // Remove __sycl_allocateLocalMemory declaration. - assert(ALMFunc->use_empty() && "__sycl_allocateLocalMemory is still in use"); - ALMFunc->eraseFromParent(); + assert(LocalMemAllocFunc->use_empty() && + "local mem allocation function is still in use"); + LocalMemAllocFunc->eraseFromParent(); +} + +static bool allocaWGLocalMemory(Module &M) { + Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); + if (!ALMFunc) + return false; + + assert(ALMFunc->isDeclaration() && "should have declaration only"); + + lowerLocalMemCall(ALMFunc, + [&](CallInst *CI) { lowerAllocaLocalMemCall(CI, M); }); + + return true; +} + +// For dynamic memory we have 2 case: +// - Direct for CUDA/HIP: we create a placeholder and set the memory on +// launch. +// - Indirect for OpenCL/Level0: we create a shared value holding the pointer +// to the buffer passed as argument. +static bool dynamicWGLocalMemory(Module &M) { + Function *DLMFunc = M.getFunction(SYCL_DYNAMIC_LOCALMEM_CALL); + if (!DLMFunc) + return false; + + GlobalVariable *LocalMemArrayGV = + M.getGlobalVariable(DYNAMIC_LOCALMEM_GV, true); + Triple TT(M.getTargetTriple()); + unsigned LocalAS = DLMFunc->getReturnType()->getPointerAddressSpace(); + + if (!LocalMemArrayGV) { + assert(DLMFunc->isDeclaration() && "should have declaration only"); + Type *LocalMemArrayTy = + TT.isSPIROrSPIRV() + ? static_cast(PointerType::get(M.getContext(), LocalAS)) + : static_cast( + ArrayType::get(Type::getInt8Ty(M.getContext()), 0)); + LocalMemArrayGV = new GlobalVariable( + M, // module + LocalMemArrayTy, // type + false, // isConstant + TT.isSPIROrSPIRV() ? GlobalValue::LinkOnceODRLinkage + : GlobalValue::ExternalLinkage, // Linkage + TT.isSPIROrSPIRV() ? 
UndefValue::get(LocalMemArrayTy) + : nullptr, // Initializer + DYNAMIC_LOCALMEM_GV, // Name prefix + nullptr, // InsertBefore + GlobalVariable::NotThreadLocal, // ThreadLocalMode + LocalAS // AddressSpace + ); + constexpr int DefaultMaxAlignment = 128; + if (!TT.isSPIROrSPIRV()) + LocalMemArrayGV->setAlignment(Align{DefaultMaxAlignment}); + } + lowerLocalMemCall(DLMFunc, [&](CallInst *CI) { + lowerDynamicLocalMemCallDirect(CI, TT, LocalMemArrayGV); + }); + if (TT.isSPIROrSPIRV()) { + SmallVector Kernels; + llvm::for_each(M.functions(), [&](Function &F) { + if (F.getCallingConv() == CallingConv::SPIR_KERNEL && + F.hasFnAttribute(WORK_GROUP_STATIC_ATTR)) { + Kernels.push_back(&F); + } + }); + for (Function *OldKernel : Kernels) { + FunctionType *FuncTy = OldKernel->getFunctionType(); + const AttributeList &FuncAttrs = OldKernel->getAttributes(); + Type *ImplicitLocalPtr = PointerType::get(M.getContext(), LocalAS); + + // Construct an argument list containing all of the previous arguments. + SmallVector Arguments; + SmallVector ArgumentAttributes; + for (const auto &I : enumerate(OldKernel->args())) { + Arguments.push_back(I.value().getType()); + ArgumentAttributes.push_back(FuncAttrs.getParamAttrs(I.index())); + } + + Arguments.push_back(ImplicitLocalPtr); + ArgumentAttributes.push_back(AttributeSet::get( + M.getContext(), + ArrayRef{ + Attribute::get(M.getContext(), Attribute::NoAlias), + Attribute::get(M.getContext(), WORK_GROUP_STATIC_ARG_ATTR)})); + + // Build the new function. + AttributeList NAttrs = + AttributeList::get(OldKernel->getContext(), FuncAttrs.getFnAttrs(), + FuncAttrs.getRetAttrs(), ArgumentAttributes); + assert(!FuncTy->isVarArg() && "Variadic arguments prohibited in SYCL"); + FunctionType *NewFuncTy = FunctionType::get( + FuncTy->getReturnType(), Arguments, FuncTy->isVarArg()); + + Function *NewFunc = Function::Create(NewFuncTy, OldKernel->getLinkage(), + OldKernel->getAddressSpace()); + + // Keep original function ordering. 
+ M.getFunctionList().insertAfter(OldKernel->getIterator(), NewFunc); + + NewFunc->copyAttributesFrom(OldKernel); + NewFunc->setComdat(OldKernel->getComdat()); + NewFunc->setAttributes(NAttrs); + NewFunc->takeName(OldKernel); + + // Splice the body of the old function right into the new function. + NewFunc->splice(NewFunc->begin(), OldKernel); + + for (Function::arg_iterator FuncArg = OldKernel->arg_begin(), + FuncEnd = OldKernel->arg_end(), + NewFuncArg = NewFunc->arg_begin(); + FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) { + FuncArg->replaceAllUsesWith(NewFuncArg); + } + + // Clone metadata of the old function, including debug info descriptor. + SmallVector, 1> MDs; + OldKernel->getAllMetadata(MDs); + for (const auto &MD : MDs) + NewFunc->addMetadata(MD.first, *MD.second); + // Store the pointer to the implicit local memory into the global + // handler. + IRBuilder<> Builder(&NewFunc->getEntryBlock(), + NewFunc->getEntryBlock().getFirstNonPHIIt()); + Builder.CreateStore(NewFunc->getArg(NewFunc->arg_size() - 1), + LocalMemArrayGV); + OldKernel->eraseFromParent(); + auto FixupMetadata = [&](StringRef MDName, Metadata *NewV) { + auto *Node = NewFunc->getMetadata(MDName); + if (!Node) + return; + SmallVector NewMD(Node->operands()); + NewMD.emplace_back(NewV); + NewFunc->setMetadata(MDName, + llvm::MDNode::get(NewFunc->getContext(), NewMD)); + }; + + FixupMetadata("kernel_arg_buffer_location", + ConstantAsMetadata::get(Builder.getInt32(-1))); + FixupMetadata("kernel_arg_runtime_aligned", + ConstantAsMetadata::get(Builder.getFalse())); + FixupMetadata("kernel_arg_exclusive_ptr", + ConstantAsMetadata::get(Builder.getFalse())); + + FixupMetadata("kernel_arg_addr_space", + ConstantAsMetadata::get(Builder.getInt32(LocalAS))); + FixupMetadata("kernel_arg_access_qual", + MDString::get(M.getContext(), "read_write")); + FixupMetadata("kernel_arg_type", MDString::get(M.getContext(), "void*")); + FixupMetadata("kernel_arg_base_type", + MDString::get(M.getContext(), "void*")); 
+ FixupMetadata("kernel_arg_type_qual", MDString::get(M.getContext(), "")); + FixupMetadata("kernel_arg_accessor_ptr", + ConstantAsMetadata::get(Builder.getFalse())); + } + } return true; } PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M, ModuleAnalysisManager &) { - if (allocaWGLocalMemory(M)) + bool MadeChanges = allocaWGLocalMemory(M); + MadeChanges = dynamicWGLocalMemory(M) || MadeChanges; + if (MadeChanges) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp index 2fe7cac00fb14..ca8b014c97a0e 100644 --- a/llvm/lib/Support/PropertySetIO.cpp +++ b/llvm/lib/Support/PropertySetIO.cpp @@ -207,6 +207,7 @@ constexpr char PropertySetRegistry::SYCL_DEVICE_GLOBALS[]; constexpr char PropertySetRegistry::SYCL_DEVICE_REQUIREMENTS[]; constexpr char PropertySetRegistry::SYCL_HOST_PIPES[]; constexpr char PropertySetRegistry::SYCL_VIRTUAL_FUNCTIONS[]; +constexpr char PropertySetRegistry::SYCL_IMPLICIT_LOCAL_ARG[]; } // namespace util } // namespace llvm diff --git a/llvm/test/SYCLLowerIR/work_group_static.ll b/llvm/test/SYCLLowerIR/work_group_static.ll new file mode 100644 index 0000000000000..105bb270f3450 --- /dev/null +++ b/llvm/test/SYCLLowerIR/work_group_static.ll @@ -0,0 +1,42 @@ +; RUN: opt -S -sycllowerwglocalmemory -bugpoint-enable-legacy-pm < %s | FileCheck %s +; RUN: opt -S -passes=sycllowerwglocalmemory < %s | FileCheck %s + +; CHECK-NOT: __sycl_dynamicLocalMemoryPlaceholder + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +; CHECK: @__sycl_dynamicLocalMemoryPlaceholder_GV = linkonce_odr addrspace(3) global ptr addrspace(3) undef + +; Function Attrs: convergent norecurse +; CHECK: @_ZTS7KernelA(ptr addrspace(1) %0, ptr addrspace(3) noalias "sycl-implicit-local-arg" %[[IMPLICT_ARG:[a-zA-Z0-9]+]]{{.*}} !kernel_arg_addr_space 
![[ADDR_SPACE_MD:[0-9]+]] +define weak_odr dso_local spir_kernel void @_ZTS7KernelA(ptr addrspace(1) %0) local_unnamed_addr #0 !kernel_arg_addr_space !5 { +entry: +; CHECK: store ptr addrspace(3) %[[IMPLICT_ARG]], ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV +; CHECK: %[[LD1:[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV + %1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1 +; CHECK: getelementptr inbounds i8, ptr addrspace(3) %[[LD1]] + %2 = getelementptr inbounds i8, ptr addrspace(3) %1, i64 4 +; CHECK: %[[LD2:[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV + %3 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 4) #1 + ret void +} + +; Function Attrs: convergent +declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1 + +attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" } +attributes #1 = { convergent norecurse } + +!llvm.module.flags = !{!0} +!opencl.spir.version = !{!1} +!spirv.Source = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{i32 4, i32 100000} +!3 = !{!"clang version 13.0.0"} +!4 = !{} +; ![[ADDR_SPACE_MD]] = !{i32 1, i32 3} +!5 = !{i32 1} diff --git a/llvm/test/SYCLLowerIR/work_group_static_nv.ll b/llvm/test/SYCLLowerIR/work_group_static_nv.ll new file mode 100644 index 0000000000000..cc957e45ea0a8 --- /dev/null +++ b/llvm/test/SYCLLowerIR/work_group_static_nv.ll @@ -0,0 +1,38 @@ +; RUN: opt -S 
-sycllowerwglocalmemory -bugpoint-enable-legacy-pm < %s | FileCheck %s +; RUN: opt -S -passes=sycllowerwglocalmemory < %s | FileCheck %s + +; CHECK-NOT: __sycl_dynamicLocalMemoryPlaceholder + +target triple = "nvptx64-nvidia-cuda" + +; CHECK: @__sycl_dynamicLocalMemoryPlaceholder_GV = external addrspace(3) global [0 x i8], align 128 + +; Function Attrs: convergent norecurse +; CHECK: @_ZTS7KernelA(ptr addrspace(1) %0) +define void @_ZTS7KernelA(ptr addrspace(1) %0) local_unnamed_addr #0 !kernel_arg_addr_space !5 { +entry: +; CHECK: getelementptr inbounds i8, ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder_GV + %1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1 + %2 = getelementptr inbounds i8, ptr addrspace(3) %1, i64 4 + %3 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 4) #1 + ret void +} + +; Function Attrs: convergent +declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1 + +attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" } +attributes #1 = { convergent norecurse } + +!llvm.module.flags = !{!0} +!opencl.spir.version = !{!1} +!spirv.Source = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{i32 4, i32 100000} +!3 = !{!"clang version 13.0.0"} +!4 = !{} +; ![[ADDR_SPACE_MD]] = !{i32 1, i32 3} +!5 = !{i32 1} diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..60a97b0eba8e3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ 
b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/Naghasan/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 3744a4e87ad76..3d4aa856fdbac 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -4,4 +4,4 @@ # Date: Tue Nov 19 10:24:08 2024 +0000 # Merge pull request #1584 from zhaomaosu/simplify-device-global # [DeviceSanitizer] Remove device global "__AsanDeviceGlobalCount" -set(UNIFIED_RUNTIME_TAG 0ea47d7c70b9a21a3d90612a0a0e7525034e62f7) +set(UNIFIED_RUNTIME_TAG bc9eef955058db60a26fe1f30cd83939122c7c01) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 5dff0396f07fb..8de3454b4b207 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1979,6 +1979,16 @@ can be used adding nodes to a graph when creating a graph from queue recording. New methods are also defined that enable submitting an executable graph, e.g. directly to a queue without returning an event. +==== sycl_ext_oneapi_work_group_scratch_memory + +The new property defined by +link:../experimental/sycl_ext_oneapi_work_group_scratch_memory.asciidoc[sycl_ext_oneapi_work_group_scratch_memory] +cannot be used in graph nodes. A synchronous exception will be thrown with error +code `invalid` if a user tries to add them to a graph. 
+ +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + == Examples and Usage Guide Detailed code examples and usage guidelines are provided in the diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_scratch_memory.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_scratch_memory.asciidoc new file mode 100644 index 0000000000000..757406b068fa4 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_scratch_memory.asciidoc @@ -0,0 +1,188 @@ += sycl_ext_oneapi_work_group_scratch_memory + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2024 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 9 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. 
+ +The following extensions are required: + +- link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] + +- link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties] + + +== Status + +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* + + +== Overview + +This extension adds a way to allocate device local memory, without explicitly passing a +kernel argument: `get_work_group_scratch_memory`. It provides access to a dynamically sized +buffer without passing it as an argument to the kernel. +Device local memory is memory that is shared by all work-items in a work-group. +The behavior is similar to the usage of unbounded array with the CUDA `+__shared__+` keyword. + + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY` to one of the values defined in the +table below. Applications can test for the existence of this macro to +determine if the implementation supports this feature, or applications can test +the macro's value to determine which of the extension's features the +implementation supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + + +=== `get_work_group_scratch_memory` function + +The `get_work_group_scratch_memory` function provides access +to a dynamically allocated buffer in the device local memory. 
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+  void* get_work_group_scratch_memory();
+}
+----
+
+_Returns_: A pointer to a dynamically allocated buffer
+ in the device local memory.
+
+The size of the allocation is unknown at compile-time,
+and must be communicated to the SYCL implementation via the
+`work_group_scratch_size` property. Every call to
+`get_work_group_scratch_memory` returns the same allocation
+in device local memory.
+
+=== Kernel properties
+
+The `work_group_scratch_size` property must be passed to a kernel to determine
+the run-time size of the device local memory allocation associated with
+all `get_work_group_scratch_memory` calls.
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+
+struct work_group_scratch_size {
+  constexpr work_group_scratch_size(size_t bytes) : value(bytes) {}
+  size_t value;
+}; // work_group_scratch_size
+
+using work_group_scratch_size_key = work_group_scratch_size;
+
+template <> struct is_property_key<work_group_scratch_size_key> : std::true_type {};
+
+} // namespace sycl::ext::oneapi::experimental
+----
+
+|===
+|Property|Description
+
+|`work_group_scratch_size`
+|The `work_group_scratch_size` property describes the amount of dynamic
+device local memory required by the kernel in bytes.
+
+|===
+
+=== Total allocation check
+
+If the total amount of device local memory requested exceeds a device's
+local memory capacity as reported by `info::device::local_mem_size`
+then the implementation must throw a synchronous exception with the
+`errc::memory_allocation` error code from the kernel invocation command
+(e.g. `parallel_for`). This check must take all APIs that allocate device
+local memory into account, whether via the `work_group_scratch_size` property
+or other APIs such as `local_accessor`.
+
+== Example
+
+[source,c++]
+----
+namespace syclex = sycl::ext::oneapi::experimental;
+
+...
+ +q.parallel_for(sycl::nd_range<1>{N, M}, + syclex::properties{syclex::work_group_scratch_size(M * sizeof(int))}, + [=](sycl::nd_item<1> it) { + auto ptr = syclex::get_work_group_scratch_memory(); + auto ptr2 = syclex::get_work_group_scratch_memory(); +}); +---- + + +== Implementation notes + +This non-normative section provides information about one possible +implementation of this extension. It is not part of the specification of the +extension's API. + +For `get_work_group_scratch_memory`, +the implementation may need to generate some additional code to +appropriately initialize the pointer(s) returned by the call. +Alternatively, it may be possible to initialize the pointer to the beginning +of the device's local memory region (if that value is known). Either way, the +implementation must account for the existence of one or more `local_accessor` +objects (which themselves may allocate a dynamic amount of device local +memory). + + +== Issues + diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_static.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_static.asciidoc new file mode 100644 index 0000000000000..709c890ffdd86 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_static.asciidoc @@ -0,0 +1,236 @@ += sycl_ext_oneapi_work_group_static + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2023 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. 
used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 9 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + + +== Status + +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* + +== Overview + +This extension adds a way to allocate device local memory, without passing a +kernel argument: `work_group_static`. +Device local memory is memory that is shared by all work-items in a work-group. +The behavior is similar to the CUDA `+__shared__+` keyword, and the extension +draws some inspiration from the {cpp} `thread_local` keyword. + +`work_group_static` can only be used to declare variables at namespace, block or class scope, +lifting many of the restrictions in the existing +link:../supported/sycl_ext_oneapi_local_memory.asciidoc[sycl_ext_oneapi_local_memory] +extension. Note, however, that `work_group_static` variables currently place +additional limits on the types that can be allocated, owing to differences in +constructor behavior. + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_WORK_GROUP_STATIC` to one of the values defined in the +table below. 
Applications can test for the existence of this macro to +determine if the implementation supports this feature, or applications can test +the macro's value to determine which of the extension's features the +implementation supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + + +=== `work_group_static` class template + +The `work_group_static` class template provides storage of +an object into device local memory. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +class work_group_static final { +public: + + work_group_static() = default; + work_group_static(const work_group_static&) = delete; + work_group_static& operator=(const work_group_static&) = delete; + + operator T&() const noexcept; + + // Available only if: std::is_array_v == false + const work_group_static& operator=(const T& value) const noexcept; + + T* operator&() const noexcept; +}; + +} // namespace sycl::ext::oneapi::experimental +---- + +`T` must be cv-unqualified, trivially constructible and trivially destructible. + +The storage for the object is allocated in device local memory before +calling the user's kernel lambda, and deallocated when all work-items +in the work-group have completed execution of the kernel. + +Objects of type `work_group_static` must only be declared at namespace, block, lambda or class scope. +If the object is declared in class scope, it must be declared as a static data member. + +SYCL implementations conforming to the full feature set treat +`work_group_static` similarly to the `thread_local` keyword, and when +a `work_group_static` object is declared at block scope it behaves +as if the `static` keyword was specified implicitly. SYCL implementations +conforming to the reduced feature set require the `static` keyword to be +specified explicitly. 
+
+[NOTE]
+====
+If a `work_group_static` object is declared at a block scope, the
+device local memory associated with the object will be identical for all
+usages of that function within the kernel. In cases where a function is called
+multiple times, developers must take care to avoid race conditions (e.g., by
+calling `group_barrier` before and after using the memory).
+====
+
+SYCL 2020 requires that all global variables accessed by a device function are
+`const` or `constexpr`. This extension lifts that restriction for
+`work_group_static` variables.
+
+Each instance of `work_group_static` is associated
+with a unique allocation in device local memory.
+
+[source,c++]
+----
+operator T&() const noexcept;
+----
+_Returns_: A reference to the object stored in the device local memory
+associated with this instance of `work_group_static`.
+
+[source,c++]
+----
+const work_group_static& operator=(const T& value) const noexcept;
+----
+_Constraints_: Available only if `std::is_array_v<T>` is false.
+
+_Effects_: Copy `value` into the storage of the `work_group_static` instance.
+
+_Returns_: A reference to this instance of `work_group_static`.
+
+[source,c++]
+----
+T* operator&() const noexcept;
+----
+_Returns_: A pointer to the device local memory associated with this
+instance of `work_group_static`.
+
+==== Interaction with common address space deduction rules
+
+Objects of type `work_group_static` are assigned to
+the local address space.
+
+
+=== Total allocation check
+
+If the total amount of device local memory requested exceeds a device's
+local memory capacity as reported by `info::device::local_mem_size`
+then the implementation must throw a synchronous exception with the
+`errc::memory_allocation` error code from the kernel invocation command
+(e.g. `parallel_for`). This check must take all APIs that allocate device
+local memory into account, whether via the `work_group_scratch_size` property
+or other APIs such as `local_accessor`.
+ +== Example + +[source,c++] +---- +namespace syclex = sycl::ext::oneapi::experimental; + +/* optional: static */ syclex::work_group_static program_scope_scalar; +/* optional: static */ syclex::work_group_static program_scope_array; + +class ClassScope { + static syclex::work_group_static class_scope_scalar; +}; + +syclex::work_group_static ClassScope::class_scope_scalar; + +void foo() { + /* optional: static */ syclex::work_group_static function_scope_scalar; + function_scope_scalar = 1; // assignment via overloaded = operator + function_scope_scalar += 2; // += operator via implicit conversion to int& + class_scope_scalar = 3; + int* ptr = &function_scope_scalar; // conversion to pointer via overloaded & operator +} + +void bar() { + /* optional: static */ syclex::work_group_static function_scope_array; + function_scope_array[0] = 1; // [] operator via implicit conversion to int(&)[64] + int* ptr = function_scope_array; // conversion to pointer via implicit conversion to int(&)[64] +} +---- + + +== Issues + +* We should clean up the wording regarding the scopes at which + `work_group_static` variables may be declared. + The current wording says they may be "allocated at global or function scope". + However, "function scope" is not a {cpp} term. + I assume we meant "block scope" here? + I assume we also meant "namespace scope" instead of "global scope"? + What about class scope or lambda scope? + Are we intentionally omitting those, or is that an oversight? + Are there any scopes where a `work_group_static` variable may not be declared? + If not, we should just say that they may be allocated at any scope. 
+** Extension changed to use namespace, block, class and lambda scopes +** Require `work_group_static` objects to be declared as static data members if used in a class scope +** `Are there any scopes where a `work_group_static` variable may not be declared?` yes, function parameter scope and non-static data members (just like for `thread_storage`) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_static.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_static.asciidoc deleted file mode 100644 index ae8b0e63fe366..0000000000000 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_static.asciidoc +++ /dev/null @@ -1,330 +0,0 @@ -= sycl_ext_oneapi_work_group_static - -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en -:dpcpp: pass:[DPC++] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - - -== Notice - -[%hardbreaks] -Copyright (C) 2023 Intel Corporation. All rights reserved. - -Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks -of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by -permission by Khronos. - - -== Contact - -To report problems with this extension, please open a new issue at: - -https://github.com/intel/llvm/issues - - -== Dependencies - -This extension is written against the SYCL 2020 revision 8 specification. All -references below to the "core SYCL specification" or to section numbers in the -SYCL specification refer to that revision. 
- -The following extensions are required only for dynamic allocations: - -- link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] - -- link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties] - - -== Status - -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* - - -== Overview - -This extension adds a way to allocate device local memory, without passing a -kernel argument. -Device local memory is memory that is shared by all work-items in a work-group. -The behavior is similar to the CUDA `+__shared__+` keyword, and the extension -draws some inspiration from the {cpp} `thread_local` keyword. - -`work_group_static` variables can be allocated at global or function scope, -lifting many of the restrictions in the existing -link:../supported/sycl_ext_oneapi_local_memory.asciidoc[sycl_ext_oneapi_local_memory] -extension. Note, however, that `work_group_static` variables currently place -additional limits on the types that can be allocated, owing to differences in -constructor behavior. - - -== Specification - -=== Feature test macro - -This extension provides a feature-test macro as described in the core SYCL -specification. An implementation supporting this extension must predefine the -macro `SYCL_EXT_ONEAPI_WORK_GROUP_SPECIFIC` to one of the values defined in the -table below. Applications can test for the existence of this macro to -determine if the implementation supports this feature, or applications can test -the macro's value to determine which of the extension's features the -implementation supports. 
- -[%header,cols="1,5"] -|=== -|Value -|Description - -|1 -|The APIs of this experimental extension are not versioned, so the - feature-test macro always has this value. -|=== - - -=== `work_group_static` class template - -The `work_group_static` class template acts as a view of an -implementation-managed pointer to device local memory. - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - -template -class work_group_static { -public: - - work_group_static() = default; - work_group_static(const work_group_static&) = delete; - work_group_static& operator=(const work_group_static&) = delete; - - operator T&() const noexcept; - - // Available only if: std::is_array_v == false - const work_group_static& operator=(const T& value) const noexcept; - - T* operator&() const noexcept; - -private: - T* ptr; // exposition only - -}; - -} // namespace sycl::ext::oneapi::experimental ----- - -`T` must be one of the following: - -* A trivially constructible and trivially destructible type, or -* An unbounded array of type `U`, where `U` is a trivially constructible and - trivially destructible type. - -The storage for the object is allocated in device local memory before -calling the user's kernel lambda, and deallocated when all work-items -in the work-group have completed execution of the kernel. - -SYCL implementations conforming to the full feature set treat -`work_group_static` similarly to the `thread_local` keyword, and when -a `work_group_static` object is declared at block scope it behaves -as if the `static` keyword was specified implicitly. SYCL implementations -conforming to the reduced feature set require the `static` keyword to be -specified explicitly. - -[NOTE] -==== -If a `work_group_static` object is declared at function scope, the -device local memory associated with the object will be identical for all -usages of that function within the kernel. 
In cases where a function is called -multiple times, developers must take care to avoid race conditions (e.g., by -calling `group_barrier` before and after using the memory). -==== - -SYCL 2020 requires that all global variables accessed by a device function are -`const` or `constexpr`. This extension lifts that restriction for -`work_group_static` variables. - -[NOTE] -==== -Since `work_group_static` acts as a view, wrapping an underlying pointer, a -developer may still choose to declare variables as `const`. -==== - -When `T` is a class type or bounded array, the size of the allocation is known -at compile-time, and a SYCL implementation may embed the size of the allocation -directly within a kernel. Each instance of `work_group_static` is associated -with a unique allocation in device local memory. - -When `T` is an unbounded array, the size of the allocation is unknown at -compile-time, and must be communicated to the SYCL implementation via the -`work_group_static_memory_size` property. Every instance of `work_group_static` -for which `T` is an unbounded array is associated with a single, shared, -allocation in device local memory. For example, two instances declared -as `work_group_static` and `work_group_static` will be -associated with the same shared allocation. - -If the total amount of device local memory requested (i.e., the sum of -all memory requested by `local_accessor`, `group_local_memory`, -`group_local_memory_for_overwrite` and `work_group_static`) exceeds a device's -local memory capacity (as reported by `local_mem_size`) then the implementation -must throw a synchronous `exception` with the `errc::memory_allocation` error -code from the kernel invocation command (e.g. `parallel_for`). - -[source,c++] ----- -operator T&() const noexcept; ----- -_Returns_: A reference to the object stored in the device local memory -associated with this instance of `work_group_static`. 
- -[source,c++] ----- -const work_group_static& operator=(const T& value) const noexcept; ----- -_Constraints_: Available only if `std::is_array_v>` is false. - -_Effects_: Replaces the value referenced by `*ptr` with `value`. - -_Returns_: A reference to this instance of `work_group_static`. - -[source,c++] ----- -T* operator&() const noexcept; ----- -_Returns_: A pointer to the device local memory associated with this -instance of `work_group_static` (i.e., `ptr`). - - -==== Kernel properties - -The `work_group_static_size` property must be passed to a kernel to determine -the run-time size of the device local memory allocation associated with -all `work_group_static` variables of unbounded array type. - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - -struct work_group_static_size { - constexpr work_group_static_size(size_t bytes) : value(bytes) {} - size_t value; -}; // work_group_static_size - -using work_group_static_size_key = work_group_static_size; - -template <>struct is_property_key : std::true_type {}; - -} // namespace sycl::ext::oneapi::experimental ----- - -|=== -|Property|Description - -|`work_group_static_size` -|The `work_group_static_size` property describes the amount of dynamic -device local memory required by the kernel in bytes. 
- -|=== - - -==== Usage examples - -===== Allocations with size known at compile-time - -[source,c++] ----- -using namespace syclex = sycl::ext::oneapi::experimental; - -/* optional: static const */ syclex::work_group_static program_scope_scalar; -/* optional: static const */ syclex::work_group_static program_scope_array; - -void foo() { - /* optional: static const */ syclex::work_group_static function_scope_scalar; - function_scope_scalar = 1; // assignment via overloaded = operator - function_scope_scalar += 2; // += operator via implicit conversion to int& - int* ptr = &function_scope_scalar; // conversion to pointer via overloaded & operator -} - -void bar() { - /* optional: static const */ sylex::work_group_static function_scope_array; - function_scope_array[0] = 1; // [] operator via implicit conversion to int(&)[64] - int* ptr = function_scope_array; // conversion to pointer via implicit conversion to int(&)[64] -} ----- - -===== Allocations with size unknown at compile-time - -[source,c++] ----- -using namespace syclex = sycl::ext::oneapi::experimental; - -/* optional: static const */ syclex::work_group_static dynamic_program_scope_array; - -... - -q.parallel_for(sycl::nd_range<1>{N, M}, - syclex::properties{syclex::work_group_static_size(M * sizeof(int))}, - [=](sycl::nd_item<1> it) { - ... -}); ----- - - -== Implementation notes - -This non-normative section provides information about one possible -implementation of this extension. It is not part of the specification of the -extension's API. 
- -For class types and bounded arrays, the class can be implemented on top of -the existing `__sycl_allocateLocalMemory` intrinsic: -[source,c++] ----- -#ifdef __SYCL_DEVICE_ONLY__ - __attribute__((opencl_local)) T *ptr = reinterpret_cast<__attribute__((opencl_local)) T *>(__sycl_allocateLocalMemory(sizeof(T), alignof(T))); -#else - T *ptr{}; -#endif ----- - -Note, however, that implementing the correct semantics may require some -adjustment to the handling of this intrinsic. A simple class as written above -would create a separate allocation for every call to an inlined function. -Creating device local memory allocations should be handled before inlining to -prevent this. - -For unbounded arrays, a separate specialization of the class will be required, -and the implementation may need to generate some additional code to -appropriately initialize the pointer(s) wrapped by `work_group_static` objects. -Alternatively, it may be possible to initialize the pointer to the beginning -of the device's local memory region (if that value is known). Either way, the -implementation must account for the existence of one or more `local_accessor` -objects (which themselves may allocate a dynamic amount of device local -memory). - - -== Issues - -* We should clean up the wording regarding the scopes at which - `work_group_static` variables may be declared. - The current wording says they may be "allocated at global or function scope". - However, "function scope" is not a {cpp} term. - I assume we meant "block scope" here? - I assume we also meant "namespace scope" instead of "global scope"? - What about class scope or lambda scope? - Are we intentionally omitting those, or is that an oversight? - Are there any scopes where a `work_group_static` variable may not be declared? - If not, we should just say that they may be allocated at any scope. 
diff --git a/sycl/include/sycl/detail/sycl_local_mem_builtins.hpp b/sycl/include/sycl/detail/sycl_local_mem_builtins.hpp new file mode 100644 index 0000000000000..c1162f309291f --- /dev/null +++ b/sycl/include/sycl/detail/sycl_local_mem_builtins.hpp @@ -0,0 +1,22 @@ +//==----- sycl_local_mem_builtins.hpp --- SYCL local memory builtins ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include // for __DPCPP_SYCL_EXTERNAL + +#ifdef __SYCL_DEVICE_ONLY__ +// Request a fixed-size allocation in local address space at kernel scope. +// Required for group_local_memory and work_group_static. +extern "C" __DPCPP_SYCL_EXTERNAL __attribute__((opencl_local)) std::uint8_t * +__sycl_allocateLocalMemory(std::size_t Size, std::size_t Alignment); +// Request a placeholder for a dynamically-sized buffer in local address space +// at kernel scope. Required for work_group_static. 
+extern "C" __DPCPP_SYCL_EXTERNAL __attribute__((opencl_local)) std::uint8_t * +__sycl_dynamicLocalMemoryPlaceholder(); +#endif diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 2bc3ef1d921ab..1756615f87265 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -59,7 +59,8 @@ enum class UnsupportedGraphFeatures { sycl_ext_oneapi_device_global = 6, sycl_ext_oneapi_bindless_images = 7, sycl_ext_oneapi_experimental_cuda_cluster_launch = 8, - sycl_ext_codeplay_enqueue_native_command = 9 + sycl_ext_codeplay_enqueue_native_command = 9, + sycl_ext_oneapi_work_group_scratch_memory = 10 }; inline const char * @@ -86,6 +87,8 @@ UnsupportedFeatureToString(UnsupportedGraphFeatures Feature) { return "sycl_ext_oneapi_experimental_cuda_cluster_launch"; case UGF::sycl_ext_codeplay_enqueue_native_command: return "sycl_ext_codeplay_enqueue_native_command"; + case UGF::sycl_ext_oneapi_work_group_scratch_memory: + return "sycl_ext_oneapi_work_group_scratch_memory"; } assert(false && "Unhandled graphs feature"); diff --git a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp index 916e5849054a1..6e65b9acffe8e 100644 --- a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp @@ -9,19 +9,14 @@ #include // for address_space, decorated #include // for __SYCL_ALWAYS_INLINE -#include // for is_group -#include // for exception -#include // for multi_ptr -#include // for workGroupBarrier +#include // for __sycl_allocateLocalMemory +#include // for is_group +#include // for exception +#include // for multi_ptr +#include // for workGroupBarrier #include // for enable_if_t -#ifdef __SYCL_DEVICE_ONLY__ -// Request a fixed-size allocation in local address space at kernel scope. 
-extern "C" __DPCPP_SYCL_EXTERNAL __attribute__((opencl_local)) std::uint8_t * -__sycl_allocateLocalMemory(std::size_t Size, std::size_t Alignment); -#endif - namespace sycl { inline namespace _V1 { namespace ext::oneapi { diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index 2483d06c73b5a..d4c45bc645e11 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -221,8 +221,9 @@ enum PropKind : uint32_t { Prefetch = 76, Deterministic = 77, InitializeToIdentity = 78, + WorkGroupScratchSize = 79, // PropKindSize must always be the last value. - PropKindSize = 79, + PropKindSize = 80, }; template struct PropertyToKind { diff --git a/sycl/include/sycl/ext/oneapi/work_group_scratch_memory.hpp b/sycl/include/sycl/ext/oneapi/work_group_scratch_memory.hpp new file mode 100644 index 0000000000000..bf7cc05a728bb --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/work_group_scratch_memory.hpp @@ -0,0 +1,54 @@ +//==--- work_group_scratch_memory.hpp - SYCL group local memory extension --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include // for __SYCL_ALWAYS_INLINE +#include // for __sycl_allocateLocalMemory +#include // for exception +#include // for properties + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi { +namespace experimental { + +__SYCL_ALWAYS_INLINE +inline void *get_work_group_scratch_memory() { +#ifdef __SYCL_DEVICE_ONLY__ + return __sycl_dynamicLocalMemoryPlaceholder(); +#else + throw sycl::exception( + sycl::errc::feature_not_supported, + "sycl_ext_oneapi_work_group_scratch_memory extension is not supported on host"); +#endif +} + +// Property +struct work_group_scratch_size + : ::sycl::ext::oneapi::experimental::detail::run_time_property_key< + work_group_scratch_size, ::sycl::ext::oneapi::experimental::detail:: + PropKind::WorkGroupScratchSize> { + // Runtime property part + constexpr work_group_scratch_size(size_t bytes) : size(bytes) {} + + size_t size; +}; + +using work_group_scratch_size_key = work_group_scratch_size; + +namespace detail { +template <> struct PropertyMetaInfo<work_group_scratch_size_key> { + static constexpr const char *name = "sycl-work-group-static"; + static constexpr int value = 1; +}; + +} // namespace detail +} // namespace experimental +} // namespace ext::oneapi +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/work_group_static.hpp b/sycl/include/sycl/ext/oneapi/work_group_static.hpp new file mode 100644 index 0000000000000..4425b05e9cff7 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/work_group_static.hpp @@ -0,0 +1,62 @@ +//==----- work_group_static.hpp --- SYCL group local memory extension -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include // for __SYCL_ALWAYS_INLINE +#include // for exception + +#include // for enable_if_t, is_trivially_destructible_v ... + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi { +namespace experimental { + +#ifdef __SYCL_DEVICE_ONLY__ +#define __SYCL_WG_SCOPE [[__sycl_detail__::wg_scope]] +#else +#define __SYCL_WG_SCOPE +#endif + +/// @brief Allocate data in device local memory. +/// Any work_group_static object will be place in device local memory and hold +/// an object of type T. work_group_static object are implicitly treated as +/// static. +/// @tparam T must be a trivially constructible and destructible type +template class __SYCL_WG_SCOPE work_group_static final { +public: + static_assert( + std::is_trivially_destructible_v && + std::is_trivially_constructible_v, + "Can only be used with trivially constructible and destructible types"); + static_assert(!std::is_const_v && !std::is_volatile_v, + "Can only be used with non const and non volatile types"); + __SYCL_ALWAYS_INLINE work_group_static() = default; + work_group_static(const work_group_static &) = delete; + work_group_static &operator=(const work_group_static &) = delete; + + operator T &() noexcept { return data; } + + template >> + work_group_static &operator=(const T &value) noexcept { + data = value; + return *this; + } + + T *operator&() noexcept { return &data; } + +private: + T data; +}; + +#undef __SYCL_WG_SCOPE + +} // namespace experimental +} // namespace ext::oneapi +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 4e8f62d53c36d..99c8e5fd9af15 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,14 @@ class 
__SYCL_EXPORT handler { prop.coordinationScope); } + if constexpr (PropertiesT::template has_property< + sycl::ext::oneapi::experimental:: + work_group_scratch_size>()) { + auto WorkGroupMemSize = Props.template get_property< + sycl::ext::oneapi::experimental::work_group_scratch_size>(); + setKernelWorkGroupMem(WorkGroupMemSize.size); + } + checkAndSetClusterRange(Props); } @@ -3550,6 +3559,9 @@ class __SYCL_EXPORT handler { // Set using cuda thread block cluster launch flag and set the launch bounds. void setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims); + // Set the request work group memory size (work_group_static ext). + void setKernelWorkGroupMem(size_t Size); + // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time // during device compilations (by reducing amount of templates we have to diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index f0dadad99dac5..5f08485159187 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -263,6 +263,7 @@ class CGExecKernel : public CG { ur_kernel_cache_config_t MKernelCacheConfig; bool MKernelIsCooperative = false; bool MKernelUsesClusterLaunch = false; + size_t MKernelWorkGroupMemorySize = 0; CGExecKernel(NDRDescT NDRDesc, std::shared_ptr HKernel, std::shared_ptr SyclKernel, @@ -273,7 +274,7 @@ class CGExecKernel : public CG { std::vector> AuxiliaryResources, CGType Type, ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative, bool MKernelUsesClusterLaunch, - detail::code_location loc = {}) + size_t KernelWorkGroupMemorySize, detail::code_location loc = {}) : CG(Type, std::move(CGData), std::move(loc)), MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), @@ -282,7 +283,8 @@ class CGExecKernel : public CG { MAuxiliaryResources(std::move(AuxiliaryResources)), MAlternativeKernels{}, 
MKernelCacheConfig(std::move(KernelCacheConfig)), MKernelIsCooperative(KernelIsCooperative), - MKernelUsesClusterLaunch(MKernelUsesClusterLaunch) { + MKernelUsesClusterLaunch(MKernelUsesClusterLaunch), + MKernelWorkGroupMemorySize(KernelWorkGroupMemorySize) { assert(getType() == CGType::Kernel && "Wrong type of exec kernel CG."); } diff --git a/sycl/source/detail/compiler.hpp b/sycl/source/detail/compiler.hpp index 35f313ceec3f5..40bf97299138f 100644 --- a/sycl/source/detail/compiler.hpp +++ b/sycl/source/detail/compiler.hpp @@ -66,6 +66,8 @@ #define __SYCL_PROPERTY_SET_SYCL_HOST_PIPES "SYCL/host pipes" /// PropertySetRegistry::SYCL_VIRTUAL_FUNCTIONS defined in PropertySetIO.h #define __SYCL_PROPERTY_SET_SYCL_VIRTUAL_FUNCTIONS "SYCL/virtual functions" +/// PropertySetRegistry::SYCL_IMPLICIT_LOCAL_ARG defined in PropertySetIO.h +#define __SYCL_PROPERTY_SET_SYCL_IMPLICIT_LOCAL_ARG "SYCL/implicit local arg" /// Program metadata tags recognized by the PI backends. For kernels the tag /// must appear after the kernel name. 
diff --git a/sycl/source/detail/device_binary_image.cpp b/sycl/source/detail/device_binary_image.cpp index 2be48d4a38fce..633a4269e1e78 100644 --- a/sycl/source/detail/device_binary_image.cpp +++ b/sycl/source/detail/device_binary_image.cpp @@ -182,6 +182,7 @@ void RTDeviceBinaryImage::init(sycl_device_binary Bin) { DeviceLibReqMask.init(Bin, __SYCL_PROPERTY_SET_DEVICELIB_REQ_MASK); KernelParamOptInfo.init(Bin, __SYCL_PROPERTY_SET_KERNEL_PARAM_OPT_INFO); AssertUsed.init(Bin, __SYCL_PROPERTY_SET_SYCL_ASSERT_USED); + ImplicitLocalArg.init(Bin, __SYCL_PROPERTY_SET_SYCL_IMPLICIT_LOCAL_ARG); ProgramMetadata.init(Bin, __SYCL_PROPERTY_SET_PROGRAM_METADATA); // Convert ProgramMetadata into the UR format for (const auto &Prop : ProgramMetadata) { diff --git a/sycl/source/detail/device_binary_image.hpp b/sycl/source/detail/device_binary_image.hpp index 203427b89ca45..1053ea72668b6 100644 --- a/sycl/source/detail/device_binary_image.hpp +++ b/sycl/source/detail/device_binary_image.hpp @@ -230,6 +230,7 @@ class RTDeviceBinaryImage { } const PropertyRange &getHostPipes() const { return HostPipes; } const PropertyRange &getVirtualFunctions() const { return VirtualFunctions; } + const PropertyRange &getImplicitLocalArg() const { return ImplicitLocalArg; } std::uintptr_t getImageID() const { assert(Bin && "Image ID is not available without a binary image."); @@ -255,6 +256,7 @@ class RTDeviceBinaryImage { RTDeviceBinaryImage::PropertyRange DeviceRequirements; RTDeviceBinaryImage::PropertyRange HostPipes; RTDeviceBinaryImage::PropertyRange VirtualFunctions; + RTDeviceBinaryImage::PropertyRange ImplicitLocalArg; std::vector ProgramMetadataUR; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 861ec2a883601..489f5a46787ff 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1080,7 +1080,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, nullptr, // TODO: Extract from handler 
UR_KERNEL_CACHE_CONFIG_DEFAULT, CG->MKernelIsCooperative, - CG->MKernelUsesClusterLaunch); + CG->MKernelUsesClusterLaunch, CG->MKernelWorkGroupMemorySize); ScheduledEvents.push_back(NewEvent); } else if (!NodeImpl->isEmpty()) { // Empty nodes are node processed as other nodes, but only their diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index e452eca0c8a6d..cf776bcbc2cc3 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -121,6 +121,7 @@ class handler_impl { bool MKernelIsCooperative = false; bool MKernelUsesClusterLaunch = false; + uint32_t MKernelWorkGroupMemorySize = 0; // Extra information for bindless image copy ur_image_desc_t MSrcImageDesc = {}; diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 19f1915943f05..355d5bb40e70a 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -1021,7 +1021,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr), std::move(CGData), std::move(FusedArgs), FusedOrCachedKernelName, {}, {}, CGType::Kernel, KernelCacheConfig, false /* KernelIsCooperative */, - false /* KernelUsesClusterLaunch*/)); + false /* KernelUsesClusterLaunch*/, 0 /* KernelWorkGroupMemorySize */)); return FusedCG; } diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 8f13c0745ad21..4796d997ae58e 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1717,6 +1717,24 @@ bool ProgramManager::kernelUsesAssert(const std::string &KernelName) const { return m_KernelUsesAssert.find(KernelName) != m_KernelUsesAssert.end(); } +void ProgramManager::cacheKernelImplicitLocalArg(RTDeviceBinaryImage &Img) { + const RTDeviceBinaryImage::PropertyRange &ImplicitLocalArgRange = + Img.getImplicitLocalArg(); + 
if (ImplicitLocalArgRange.isAvailable()) + for (auto Prop : ImplicitLocalArgRange) { + m_KernelImplicitLocalArgPos[Prop->Name] = + DeviceBinaryProperty(Prop).asUint32(); + } +} + +std::optional +ProgramManager::kernelImplicitLocalArgPos(const std::string &KernelName) const { + auto it = m_KernelImplicitLocalArgPos.find(KernelName); + if (it != m_KernelImplicitLocalArgPos.end()) + return it->second; + return {}; +} + void ProgramManager::addImages(sycl_device_binaries DeviceBinary) { const bool DumpImages = std::getenv("SYCL_DUMP_IMAGES") && !m_UseSpvFile; for (int I = 0; I < DeviceBinary->NumDeviceBinaries; I++) { @@ -1827,6 +1845,8 @@ void ProgramManager::addImages(sycl_device_binaries DeviceBinary) { Prop && (detail::DeviceBinaryProperty(Prop).asUint32() != 0); } + cacheKernelImplicitLocalArg(*Img); + // Sort kernel ids for faster search std::sort(m_BinImg2KernelIDs[Img.get()]->begin(), m_BinImg2KernelIDs[Img.get()]->end(), LessByHash{}); diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 0586a41a83540..60d43a87e3fa6 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -292,6 +292,9 @@ class ProgramManager { bool kernelUsesAssert(const std::string &KernelName) const; + std::optional + kernelImplicitLocalArgPos(const std::string &KernelName) const; + bool kernelUsesAsan() const { return m_AsanFoundInImage; } std::set @@ -317,6 +320,9 @@ class ProgramManager { /// Add info on kernels using assert into cache void cacheKernelUsesAssertInfo(RTDeviceBinaryImage &Img); + /// Add info on kernels using local arg into cache + void cacheKernelImplicitLocalArg(RTDeviceBinaryImage &Img); + std::set collectDeviceImageDepsForImportedSymbols(const RTDeviceBinaryImage &Img, device Dev); @@ -418,6 +424,7 @@ class ProgramManager { RTDeviceBinaryImageUPtr m_SpvFileImage; std::set m_KernelUsesAssert; + std::unordered_map 
m_KernelImplicitLocalArgPos; // True iff there is a device image compiled with AddressSanitizer bool m_AsanFoundInImage; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 091504a983ff3..d23e94e9f97b8 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2427,7 +2427,8 @@ static ur_result_t SetKernelParamsAndLaunch( const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, - const RTDeviceBinaryImage *BinImage, const std::string &KernelName) { + uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage, + const std::string &KernelName) { assert(Queue && "Kernel submissions should have an associated queue"); const AdapterPtr &Adapter = Queue->getAdapter(); @@ -2447,6 +2448,17 @@ static ur_result_t SetKernelParamsAndLaunch( applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); + std::optional ImplicitLocalArg = + ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName); + // Set the implicit local memory buffer to support + // get_work_group_scratch_memory. This is for backend not supporting + // CUDA-style local memory setting. Note that we may have -1 as a position, + // this indicates the buffer is actually unused and was elided. 
+ if (ImplicitLocalArg.has_value() && ImplicitLocalArg.value() != -1) { + Adapter->call( + Kernel, ImplicitLocalArg.value(), WorkGroupMemorySize, nullptr); + } + adjustNDRangePerKernel(NDRDesc, Kernel, *(Queue->getDeviceImplPtr())); // Remember this information before the range dimensions are reversed @@ -2474,9 +2486,8 @@ static ur_result_t SetKernelParamsAndLaunch( } if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); + std::vector property_list; if (KernelUsesClusterLaunch) { - std::vector property_list; - ur_exp_launch_property_value_t launch_property_value_cluster_range; launch_property_value_cluster_range.clusterDim[0] = NDRDesc.ClusterDimensions[0]; @@ -2494,13 +2505,20 @@ static ur_result_t SetKernelParamsAndLaunch( property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE, launch_property_value_cooperative}); } - + } + // If there is no implicit arg, let the driver handle it via a property + if (WorkGroupMemorySize && !ImplicitLocalArg.has_value()) { + property_list.push_back( + {UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, WorkGroupMemorySize}); + } + if (!property_list.empty()) { ur_event_handle_t UREvent = nullptr; ur_result_t Error = Adapter->call_nocheck( - Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalSize[0], - LocalSize, property_list.size(), property_list.data(), - RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], + Queue->getHandleRef(), Kernel, NDRDesc.Dims, + &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, + property_list.size(), property_list.data(), RawEvents.size(), + RawEvents.empty() ? nullptr : &RawEvents[0], OutEventImpl ? 
&UREvent : nullptr); if (OutEventImpl) { OutEventImpl->setHandle(UREvent); @@ -2692,7 +2710,8 @@ void enqueueImpKernel( const detail::EventImplPtr &OutEventImpl, const std::function &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, - const bool KernelUsesClusterLaunch, const RTDeviceBinaryImage *BinImage) { + const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, + const RTDeviceBinaryImage *BinImage) { assert(Queue && "Kernel submissions should have an associated queue"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); @@ -2784,7 +2803,8 @@ void enqueueImpKernel( Error = SetKernelParamsAndLaunch( Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList, OutEventImpl, EliminatedArgMask, getMemAllocationFunc, - KernelIsCooperative, KernelUsesClusterLaunch, BinImage, KernelName); + KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize, + BinImage, KernelName); const AdapterPtr &Adapter = Queue->getAdapter(); if (!SyclKernelImpl && !MSyclKernel) { @@ -3176,7 +3196,8 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { SyclKernel, KernelName, RawEvents, EventImpl, getMemAllocationFunc, ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative, - ExecKernel->MKernelUsesClusterLaunch, BinImage); + ExecKernel->MKernelUsesClusterLaunch, + ExecKernel->MKernelWorkGroupMemorySize, BinImage); return UR_RESULT_SUCCESS; } diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 1aecf5ed4eabb..a49f52fbbc436 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -627,7 +627,7 @@ void enqueueImpKernel( const detail::EventImplPtr &Event, const std::function &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative, - const bool KernelUsesClusterLaunch, + const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, const RTDeviceBinaryImage 
*BinImage = nullptr); /// The exec CG command enqueues execution of kernel or explicit memory diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index 8f4fb05752efc..06e0541c74c53 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -109,6 +109,8 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_PROFILING_TAG 1 #define SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND 1 #define SYCL_EXT_ONEAPI_GET_KERNEL_INFO 1 +#define SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY 1 +#define SYCL_EXT_ONEAPI_WORK_GROUP_STATIC 1 // In progress yet #define SYCL_EXT_ONEAPI_ATOMIC16 0 diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index a7ac73f9e4c34..476a6e373f2db 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -290,7 +290,8 @@ event handler::finalize() { KernelBundleImpPtr, MKernel, MKernelName.c_str(), RawEvents, NewEvent, nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, BinImage); + impl->MKernelUsesClusterLaunch, + impl->MKernelWorkGroupMemorySize, BinImage); #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { @@ -349,7 +350,8 @@ event handler::finalize() { std::move(impl->MArgs), MKernelName.c_str(), std::move(MStreamStorage), std::move(impl->MAuxiliaryResources), getType(), impl->MKernelCacheConfig, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, MCodeLoc)); + impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, + MCodeLoc)); break; } case detail::CGType::CopyAccToPtr: @@ -1962,6 +1964,12 @@ void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { impl->MNDRDesc.setClusterDimensions(ClusterSize, Dims); } +void handler::setKernelWorkGroupMem(size_t Size) { + throwIfGraphAssociated(); + impl->MKernelWorkGroupMemorySize = Size; +} + void handler::ext_oneapi_graph( ext::oneapi::experimental::command_graph< 
ext::oneapi::experimental::graph_state::executable> diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/copy_dynamic_size.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/copy_dynamic_size.cpp new file mode 100644 index 0000000000000..1f61653efc44e --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/copy_dynamic_size.cpp @@ -0,0 +1,56 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +#include +#include +#include +#include + +constexpr size_t Size = 1024; +using DataType = int; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +void copy_via_smem(DataType *a, DataType *b, sycl::nd_item<1> it) { + // one work-group copies data to shared memory from A + // And then puts in back into B + + DataType *smem_ptr = + reinterpret_cast(sycl_ext::get_work_group_scratch_memory()); + auto threadIdx_x = it.get_local_linear_id(); + + smem_ptr[threadIdx_x] = a[threadIdx_x]; + sycl::group_barrier(it.get_group()); + + b[threadIdx_x] = smem_ptr[threadIdx_x]; +} + +int main() { + sycl::queue queue; + DataType *a = sycl::malloc_device(Size, queue); + DataType *b = sycl::malloc_device(Size, queue); + std::vector a_host(Size, 1.0); + std::vector b_host(Size, -5.0); + + queue.copy(a_host.data(), a, Size).wait_and_throw(); + + queue + .submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<1>({Size}, {Size}), + sycl_ext::properties{sycl_ext::work_group_scratch_size( + Size * sizeof(DataType))}, + [=](sycl::nd_item<1> it) { copy_via_smem(a, b, it); }); + }) + .wait_and_throw(); + + queue.copy(b, b_host.data(), Size).wait_and_throw(); + for (size_t i = 0; i < b_host.size(); i++) { + assert(b_host[i] == a_host[i]); + } + sycl::free(a, queue); + sycl::free(b, queue); +} diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_local_accessor.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_local_accessor.cpp new file mode 100644 index 
0000000000000..a407d56077d5c --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_local_accessor.cpp @@ -0,0 +1,74 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +// Test work_group_dynamic extension with allocation size specified at runtime +// and an additional local accessor. + +#include +#include + +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t RepeatWG = 16; +constexpr size_t ElemPerWG = WgSize * RepeatWG; +constexpr size_t Size = WgSize * WgCount * RepeatWG; + +using namespace sycl; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +int main() { + queue Q; + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + sycl_ext::work_group_scratch_size static_size(WgSize * RepeatWG * + sizeof(int)); + sycl_ext::properties properties{static_size}; + auto LocalAccessor = + sycl::local_accessor(WgSize * RepeatWG * sizeof(int), Cgh); + Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties, + [=](nd_item<1> Item) { + int *Ptr = reinterpret_cast( + sycl_ext::get_work_group_scratch_memory()); + size_t GroupOffset = + Item.get_group_linear_id() * ElemPerWG; + for (size_t I = 0; I < RepeatWG; ++I) { + Ptr[WgSize * I + Item.get_local_linear_id()] = + Item.get_local_linear_id(); + } + Item.barrier(); + + for (size_t I = 0; I < RepeatWG; ++I) { + // Check that the local accessor works. 
+ size_t LocalIdx = Item.get_local_linear_id() ^ 1; + LocalAccessor[WgSize * I + LocalIdx] = + Ptr[WgSize * I + LocalIdx] + 1; + } + Item.barrier(); + + for (size_t I = 0; I < RepeatWG; ++I) { + // Check that the memory is accessible from other + // work-items + size_t BaseIdx = GroupOffset + (I * WgSize); + size_t LocalIdx = Item.get_local_linear_id(); + size_t GlobalIdx = BaseIdx + LocalIdx; + Acc[GlobalIdx] = LocalAccessor[WgSize * I + LocalIdx]; + } + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) { + std::cout << I << ": " << Acc[I] << std::endl; + assert(Acc[I] == I % WgSize + 1); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_ptr_alias.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_ptr_alias.cpp new file mode 100644 index 0000000000000..e56ccc66bc364 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_alloc_ptr_alias.cpp @@ -0,0 +1,68 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +// Test work_group_dynamic extension with allocation size specified at runtime +// and multiple calls to the extension inside the kernel. 
+ +#include +#include + +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t RepeatWG = 16; +constexpr size_t ElemPerWG = WgSize * RepeatWG; +constexpr size_t Size = WgSize * WgCount * RepeatWG; + +using namespace sycl; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +int main() { + queue Q; + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + sycl_ext::work_group_scratch_size static_size(WgSize * RepeatWG * + sizeof(int)); + sycl_ext::properties properties{static_size}; + Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties, + [=](nd_item<1> Item) { + int *Ptr = reinterpret_cast( + sycl_ext::get_work_group_scratch_memory()); + size_t GroupOffset = + Item.get_group_linear_id() * ElemPerWG; + for (size_t I = 0; I < RepeatWG; ++I) { + Ptr[WgSize * I + Item.get_local_linear_id()] = + Item.get_local_linear_id(); + } + + Item.barrier(); + // Check that multiple calls return the same pointer. 
+ unsigned int *PtrAlias = + reinterpret_cast( + sycl_ext::get_work_group_scratch_memory()); + + for (size_t I = 0; I < RepeatWG; ++I) { + // Check that the memory is accessible from other + // work-items + size_t BaseIdx = GroupOffset + (I * WgSize); + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = BaseIdx + LocalIdx; + Acc[GlobalIdx] = PtrAlias[WgSize * I + LocalIdx]; + } + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) { + assert(Acc[I] == I % WgSize); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_allocation.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_allocation.cpp new file mode 100644 index 0000000000000..76032271d7b5c --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_allocation.cpp @@ -0,0 +1,62 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +// Test work_group_dynamic extension with allocation size specified at runtime. 
+ +#include +#include + +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t RepeatWG = 16; +constexpr size_t ElemPerWG = WgSize * RepeatWG; +constexpr size_t Size = WgSize * WgCount * RepeatWG; + +using namespace sycl; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +int main() { + queue Q; + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + sycl_ext::work_group_scratch_size static_size(WgSize * RepeatWG * + sizeof(int)); + sycl_ext::properties properties{static_size}; + Cgh.parallel_for(nd_range<1>(range<1>(Size), range<1>(WgSize)), properties, + [=](nd_item<1> Item) { + int *Ptr = reinterpret_cast( + sycl_ext::get_work_group_scratch_memory()); + size_t GroupOffset = + Item.get_group_linear_id() * ElemPerWG; + for (size_t I = 0; I < RepeatWG; ++I) { + Ptr[WgSize * I + Item.get_local_linear_id()] = + Item.get_local_linear_id(); + } + + Item.barrier(); + for (size_t I = 0; I < RepeatWG; ++I) { + // Check that the memory is accessible from other + // work-items + size_t BaseIdx = GroupOffset + (I * WgSize); + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = BaseIdx + LocalIdx; + Acc[GlobalIdx] = Ptr[WgSize * I + LocalIdx]; + } + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) { + assert(Acc[I] == I % WgSize); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_unused.cpp b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_unused.cpp new file mode 100644 index 0000000000000..e427305c18ed3 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Dynamic/dynamic_unused.cpp @@ -0,0 +1,44 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// UNSUPPORTED: gpu-intel-gen12 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +#include +#include +#include + +constexpr size_t Size = 1024; +using DataType = int; + +namespace sycl_ext = 
sycl::ext::oneapi::experimental; + +int main() { + sycl::queue queue; + DataType *a = sycl::malloc_device(Size, queue); + DataType *b = sycl::malloc_device(Size, queue); + std::vector a_host(Size, 1.0); + std::vector b_host(Size, -5.0); + + queue.copy(a_host.data(), a, Size).wait_and_throw(); + + queue + .submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<1>({Size}, {Size}), + sycl_ext::properties{sycl_ext::work_group_scratch_size( + Size * sizeof(DataType))}, + [=](sycl::nd_item<1> it) { + b[it.get_local_linear_id()] = + a[it.get_local_linear_id()]; + }); + }) + .wait_and_throw(); + + queue.copy(b, b_host.data(), Size).wait_and_throw(); + for (size_t i = 0; i < b_host.size(); i++) { + assert(b_host[i] == a_host[i]); + } + sycl::free(a, queue); + sycl::free(b, queue); +} diff --git a/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_block_scope.cpp b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_block_scope.cpp new file mode 100644 index 0000000000000..ea1083c92be52 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_block_scope.cpp @@ -0,0 +1,89 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +#include + +#include + +#include +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t Size = WgSize * WgCount; + +struct Foo { + Foo() = delete; + Foo(int Value, int &Counter) { + for (int I = 0; I < WgSize; ++I) + Values[I] = Value; + ++Counter; + } + int Values[WgSize]; +}; + +struct Bar { + int Value = 42; +}; + +class KernelA; +class KernelB; +class KernelC; + +using namespace sycl; + +int main() { + queue Q; + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + sycl::ext::oneapi::experimental::work_group_static + localIDBuff; + localIDBuff[Item.get_local_linear_id()] 
= + Item.get_local_linear_id(); + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + Acc[GlobalIdx] = localIDBuff[LocalIdx]; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == I % WgSize); + } + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + sycl::ext::oneapi::experimental::work_group_static localIDBuff; + int id = Item.get_global_linear_id(); + if (Item.get_group().leader()) + localIDBuff = id; + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t GlobalIdx = Item.get_global_linear_id(); + Acc[GlobalIdx] = localIDBuff; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == (I / WgSize) * WgSize); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_class_scope.cpp b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_class_scope.cpp new file mode 100644 index 0000000000000..cfcde25996679 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_class_scope.cpp @@ -0,0 +1,95 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +#include + +#include + +#include +#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t Size = WgSize * WgCount; + +struct Foo { + Foo() = delete; + Foo(int Value, int &Counter) { + for (int I = 0; I < WgSize; ++I) + Values[I] = Value; + ++Counter; + } + int Values[WgSize]; +}; + +struct Bar { + int Value = 42; +}; + +class KernelA; +class KernelB; +class KernelC; + +using namespace sycl; + +struct LocalMem { + // Local mem used in kernel + static 
sycl::ext::oneapi::experimental::work_group_static + localIDBuff; +}; +sycl::ext::oneapi::experimental::work_group_static + LocalMem::localIDBuff; + +int main() { + queue Q; + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + LocalMem::localIDBuff[Item.get_local_linear_id()] = + Item.get_local_linear_id(); + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + Acc[GlobalIdx] = LocalMem::localIDBuff[LocalIdx]; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == I % WgSize); + } + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + sycl::ext::oneapi::experimental::work_group_static localIDBuff; + int id = Item.get_global_linear_id(); + if (Item.get_group().leader()) + localIDBuff = id; + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t GlobalIdx = Item.get_global_linear_id(); + Acc[GlobalIdx] = localIDBuff; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == (I / WgSize) * WgSize); + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_namespace_scope.cpp b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_namespace_scope.cpp new file mode 100644 index 0000000000000..15996f8753544 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/Static/work_group_static_memory_namespace_scope.cpp @@ -0,0 +1,90 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +#include + +#include + +#include 
+#include + +constexpr size_t WgSize = 32; +constexpr size_t WgCount = 4; +constexpr size_t Size = WgSize * WgCount; + +struct Foo { + Foo() = delete; + Foo(int Value, int &Counter) { + for (int I = 0; I < WgSize; ++I) + Values[I] = Value; + ++Counter; + } + int Values[WgSize]; +}; + +struct Bar { + int Value = 42; +}; + +class KernelA; +class KernelB; +class KernelC; + +using namespace sycl; + +// Local mem used in kernel +sycl::ext::oneapi::experimental::work_group_static localIDBuff; + +int main() { + queue Q; + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + localIDBuff[Item.get_local_linear_id()] = + Item.get_local_linear_id(); + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + Acc[GlobalIdx] = localIDBuff[LocalIdx]; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == I % WgSize); + } + + { + std::vector Vec(Size, 0); + buffer Buf{Vec.data(), range<1>(Size)}; + + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.parallel_for( + nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) { + sycl::ext::oneapi::experimental::work_group_static localIDBuff; + int id = Item.get_global_linear_id(); + if (Item.get_group().leader()) + localIDBuff = id; + + Item.barrier(); + // Check that the memory is accessible from other work-items + size_t GlobalIdx = Item.get_global_linear_id(); + Acc[GlobalIdx] = localIDBuff; + }); + }); + + host_accessor Acc(Buf, read_only); + for (size_t I = 0; I < Size; ++I) + assert(Acc[I] == (I / WgSize) * WgSize); + } +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 
a5134a7a524ca..2028de654cfea 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3524,6 +3524,7 @@ _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE _ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE _ZN4sycl3_V17handler22setKernelClusterLaunchENS0_5rangeILi3EEEi _ZN4sycl3_V17handler22setKernelIsCooperativeEb +_ZN4sycl3_V17handler21setKernelWorkGroupMemEm _ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_ _ZN4sycl3_V17handler24ext_intel_read_host_pipeENS0_6detail11string_viewEPvmb _ZN4sycl3_V17handler24ext_oneapi_memcpy2d_implEPvmPKvmmm diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index a6e6a5e47c137..ff2e3706e25dd 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4241,6 +4241,7 @@ ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4StableKernelCacheConfig@123@@Z ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@H@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z +?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z ?setNDRangeDescriptorPadded@handler@_V1@sycl@@AEAAXV?$range@$02@23@0V?$id@$02@23@H@Z ?setNDRangeDescriptorPadded@handler@_V1@sycl@@AEAAXV?$range@$02@23@V?$id@$02@23@H@Z diff --git a/sycl/test/extensions/work_group_static/work_group_static_negative.cpp b/sycl/test/extensions/work_group_static/work_group_static_negative.cpp new file mode 100644 index 0000000000000..c5ddbe4905439 --- /dev/null +++ b/sycl/test/extensions/work_group_static/work_group_static_negative.cpp @@ -0,0 +1,19 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=warning,note %s + +#include + +#include + +using namespace sycl::ext::oneapi::experimental; + +class InvalidCtorDtor { + InvalidCtorDtor() {} + ~InvalidCtorDtor() {} 
+}; + +SYCL_EXTERNAL void test(int *p) { + // expected-error-re@sycl/ext/oneapi/work_group_static.hpp:* {{static assertion failed due to requirement {{.+}}: Can only be used with non const and non volatile types}} + sycl::ext::oneapi::experimental::work_group_static b1; + // expected-error-re@sycl/ext/oneapi/work_group_static.hpp:* {{static assertion failed due to requirement {{.+}}: Can only be used with trivially constructible and destructible types}} + sycl::ext::oneapi::experimental::work_group_static b2; +} diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 60a935d9ae465..ff5e28772f0fb 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -131,6 +131,8 @@ // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp +// CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp +// CHECK-NEXT: detail/sycl_local_mem_builtins.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: kernel_bundle.hpp // CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 90d95975a0245..452738b8fca86 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -393,6 +393,27 @@ TEST_F(CommandGraphTest, EnqueueCustomCommandCheck) { ASSERT_EQ(ExceptionCode, sycl::errc::invalid); } +// sycl_ext_oneapi_work_group_scratch_memory isn't supported with SYCL graphs +TEST_F(CommandGraphTest, WorkGroupScratchMemoryCheck) { + ASSERT_THROW( + { + try { + Graph.add([&](handler &CGH) { + CGH.parallel_for( + range<1>{1}, + ext::oneapi::experimental::properties{ + ext::oneapi::experimental::work_group_scratch_size( + sizeof(int))}, + [=](item<1> idx) {}); + }); + } 
catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); +} + TEST_F(CommandGraphTest, MakeEdgeErrors) { // Set up some nodes in the graph auto NodeA = Graph.add( diff --git a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp index 0d920590c6e15..02c007c7d27e7 100644 --- a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp +++ b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp @@ -140,7 +140,7 @@ class MockHandler : public sycl::handler { CGH->MKernelName.c_str(), std::move(CGH->MStreamStorage), std::move(impl->MAuxiliaryResources), impl->MCGType, {}, impl->MKernelIsCooperative, impl->MKernelUsesClusterLaunch, - CGH->MCodeLoc)); + impl->MKernelWorkGroupMemorySize, CGH->MCodeLoc)); break; } default: diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 424c9a7dbb58c..ffc0567ba7daa 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -302,7 +302,8 @@ class MockHandlerCustomFinalize : public MockHandler { std::move(impl->MKernelBundle), std::move(CGData), getArgs(), getKernelName(), getStreamStorage(), impl->MAuxiliaryResources, getType(), {}, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, getCodeLoc())); + impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, + getCodeLoc())); break; } case sycl::detail::CGType::CodeplayHostTask: { diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 852d59a43e123..132baecad5448 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -37,7 +37,7 @@ class MockHandlerStreamInit : public MockHandler { getArgs(), getKernelName(), getStreamStorage(), 
std::move(impl->MAuxiliaryResources), getType(), {}, impl->MKernelIsCooperative, impl->MKernelUsesClusterLaunch, - getCodeLoc())); + impl->MKernelWorkGroupMemorySize, getCodeLoc())); break; } default: