From 5b2d0a01e76dfed0e6e7aac794f78dd91817cd74 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Mon, 9 Jan 2023 13:00:37 +0000
Subject: [PATCH 01/25] [SYCL][Fusion] Embed LLVM IR for SYCL for Nvidia

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 clang/include/clang/Driver/Action.h   |  7 ++++++-
 clang/include/clang/Driver/Options.td |  2 ++
 clang/lib/Driver/Action.cpp           |  6 +++---
 clang/lib/Driver/Driver.cpp           | 10 ++++++++++
 clang/lib/Driver/ToolChains/Clang.cpp | 10 +++++++++-
 5 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index 2c9f59cdde9b0..9dab2e32b2ffc 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -660,9 +660,14 @@ class OffloadUnbundlingJobAction final : public JobAction {
 class OffloadWrapperJobAction : public JobAction {
   void anchor() override;
 
+  bool EmbedIR;
+
 public:
   OffloadWrapperJobAction(ActionList &Inputs, types::ID Type);
-  OffloadWrapperJobAction(Action *Input, types::ID OutputType);
+  OffloadWrapperJobAction(Action *Input, types::ID OutputType,
+                          bool IsEmbeddedIR = false);
+
+  bool isEmbeddedIR() const { return EmbedIR; }
 
   static bool classof(const Action *A) {
     return A->getKind() == OffloadWrapperJobClass;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index aa56bc605a7a8..6afef8e821e08 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2976,6 +2976,8 @@ def fintelfpga : Flag<["-"], "fintelfpga">, Group<f_Group>,
   HelpText<"Perform ahead-of-time compilation for FPGA">;
 def fsycl_device_only : Flag<["-"], "fsycl-device-only">, Flags<[CoreOption]>,
   HelpText<"Compile SYCL kernels for device">;
+def fsycl_embed_ir : Flag<["-"], "fsycl-embed-ir">, Flags<[CoreOption]>,
+  HelpText<"Embed LLVM IR for runtime kernel fusion">;
 defm sycl_esimd_force_stateless_mem : BoolFOption<"sycl-esimd-force-stateless-mem",
     LangOpts<"SYCLESIMDForceStatelessMem">, DefaultFalse,
     PosFlag<SetTrue, [], "Enforce using stateless memory accesses. "
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index e8c29e7a2f147..f55233c59a6db 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -480,9 +480,9 @@ OffloadWrapperJobAction::OffloadWrapperJobAction(ActionList &Inputs,
                                                  types::ID Type)
   : JobAction(OffloadWrapperJobClass, Inputs, Type) {}
 
-OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input,
-                                                 types::ID Type)
-    : JobAction(OffloadWrapperJobClass, Input, Type) {}
+OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input, types::ID Type,
+                                                 bool IsEmbeddedIR)
+    : JobAction(OffloadWrapperJobClass, Input, Type), EmbedIR(IsEmbeddedIR) {}
 
 void OffloadPackagerJobAction::anchor() {}
 
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index ce24331d75f31..a7b15f4d8a9e5 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5694,6 +5694,16 @@ class OffloadingActionBuilder final {
             return TypedPostLinkAction;
           };
           Action *PostLinkAction = createPostLinkAction();
+          if (isNVPTX && Args.hasArg(options::OPT_fsycl_embed_ir)) {
+            // When compiling for Nvidia/CUDA devices and the user requested the
+            // IR to be embedded in the application (via option), run the output
+            // of sycl-post-link (filetable referencing LLVM Bitcode + symbols)
+            // through the offload wrapper and link the resulting object to the
+            // application.
+            auto *WrapBitcodeAction = C.MakeAction<OffloadWrapperJobAction>(
+                PostLinkAction, types::TY_Object, true);
+            DA.add(*WrapBitcodeAction, *TC, BoundArch, Action::OFK_SYCL);
+          }
           bool NoRDCFatStaticArchive =
               !IsRDC &&
               FullDeviceLinkAction->getType() == types::TY_Tempfilelist;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 42e5a64dc2993..b2883f49a5a31 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9253,6 +9253,14 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
       createArgString("-link-opts=");
     }
 
+    bool IsEmbeddedIR = cast<OffloadWrapperJobAction>(JA).isEmbeddedIR();
+    if (IsEmbeddedIR) {
+      // When the offload-wrapper is called to embed LLVM IR, add a prefix to
+      // the target triple to distinguish the LLVM IR from the actual device
+      // binary for that target.
+      TargetTripleOpt = ("llvm_" + TargetTripleOpt).str();
+    }
+
     WrapperArgs.push_back(
         C.getArgs().MakeArgString(Twine("-target=") + TargetTripleOpt));
 
@@ -9274,7 +9282,7 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA,
     assert(I.isFilename() && "Invalid input.");
 
     if (I.getType() == types::TY_Tempfiletable ||
-        I.getType() == types::TY_Tempfilelist)
+        I.getType() == types::TY_Tempfilelist || IsEmbeddedIR)
       // wrapper actual input files are passed via the batch job file table:
       WrapperArgs.push_back(C.getArgs().MakeArgString("-batch"));
     WrapperArgs.push_back(C.getArgs().MakeArgString(I.getFilename()));

From ee8c29b0f6d2a5e8195238f9c49dd4688004dd89 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 10 Jan 2023 13:29:22 +0000
Subject: [PATCH 02/25] Enable LLVM IR as alternative fusion input format;

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/CMakeLists.txt       |   4 +-
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |   5 +-
 .../lib/translation/LoadKernels.cpp           | 116 ++++++++++++++++++
 .../lib/translation/LoadKernels.h             |  36 ++++++
 .../lib/translation/SPIRVLLVMTranslation.cpp  |  94 +++-----------
 .../lib/translation/SPIRVLLVMTranslation.h    |  13 +-
 sycl/source/detail/jit_compiler.cpp           | 108 +++++++++++-----
 7 files changed, 251 insertions(+), 125 deletions(-)
 create mode 100644 sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
 create mode 100644 sycl-fusion/jit-compiler/lib/translation/LoadKernels.h

diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index de6a73b1eab3d..92f0cefd68634 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -2,13 +2,15 @@
 add_llvm_library(sycl-fusion
    lib/KernelFusion.cpp
    lib/JITContext.cpp
+   lib/translation/LoadKernels.cpp
    lib/translation/SPIRVLLVMTranslation.cpp
    lib/fusion/FusionPipeline.cpp
    lib/fusion/FusionHelper.cpp
    lib/fusion/ModuleHelper.cpp
    lib/helper/ConfigHelper.cpp
 
-  LINK_COMPONENTS
+   LINK_COMPONENTS
+   BitReader
    Core
    Support
    Analysis
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 56ea9401c465b..fc168587738b5 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -15,6 +15,7 @@
 #include "fusion/FusionPipeline.h"
 #include "helper/ConfigHelper.h"
 #include "helper/ErrorHandling.h"
+#include "translation/LoadKernels.h"
 #include "translation/SPIRVLLVMTranslation.h"
 #include <llvm/Support/Error.h>
 #include <sstream>
@@ -97,8 +98,8 @@ FusionResult KernelFusion::fuseKernels(
   // Load all input kernels from their respective SPIR-V modules into a single
   // LLVM IR module.
   llvm::Expected<std::unique_ptr<llvm::Module>> ModOrError =
-      translation::SPIRVLLVMTranslator::loadSPIRVKernels(
-          *JITCtx.getLLVMContext(), ModuleInfo.kernels());
+      translation::KernelLoader::loadKernels(*JITCtx.getLLVMContext(),
+                                             ModuleInfo.kernels());
   if (auto Error = ModOrError.takeError()) {
     return errorToFusionResult(std::move(Error), "SPIR-V translation failed");
   }
diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
new file mode 100644
index 0000000000000..cabf1f6c44e9f
--- /dev/null
+++ b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
@@ -0,0 +1,116 @@
+//==-------------------------- LoadKernels.cpp  ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LoadKernels.h"
+#include "SPIRVLLVMTranslation.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace jit_compiler;
+using namespace jit_compiler::translation;
+using namespace llvm;
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx,
+                          std::vector<SYCLKernelInfo> &Kernels) {
+  std::unique_ptr<Module> Result{nullptr};
+  bool First = true;
+  DenseSet<BinaryBlob> ParsedBinaries;
+  size_t AddressBits = 0;
+  for (auto &Kernel : Kernels) {
+    // FIXME: Currently, we use the front of the list.
+    // Do we need to iterate to find the most suitable
+    // SPIR-V module?
+    SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
+
+    const unsigned char *ModulePtr = BinInfo.BinaryStart;
+    size_t ModuleSize = BinInfo.BinarySize;
+    BinaryBlob BinBlob{ModulePtr, ModuleSize};
+    if (ParsedBinaries.contains(BinBlob)) {
+      // Multiple kernels can be stored in the same SPIR-V or LLVM IR module.
+      // If we encountered the same binary module before, skip.
+      // NOTE: We compare the pointer as well as the size, in case
+      // a previous kernel only referenced part of the SPIR-V/LLVM IR module.
+      // Not sure this can actually happen, but better safe than sorry.
+      continue;
+    }
+    // Simply load and translate the SPIR-V into the currently still empty
+    // module.
+    std::unique_ptr<llvm::Module> NewMod;
+
+    switch (BinInfo.Format) {
+    case BinaryFormat::LLVM: {
+      auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel);
+      if (auto Err = ModOrError.takeError()) {
+        return std::move(Err);
+      }
+      NewMod = std::move(*ModOrError);
+      break;
+    }
+    case BinaryFormat::SPIRV: {
+      auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel);
+      if (auto Err = ModOrError.takeError()) {
+        return std::move(Err);
+      }
+      NewMod = std::move(*ModOrError);
+      break;
+    }
+    default: {
+      return createStringError(
+          inconvertibleErrorCode(),
+          "Failed to load kernel from unsupported input format");
+    }
+    }
+
+    // We do not assume that the input binary information has the address bits
+    // set, but rather retrieve this information from the SPIR-V/LLVM module's
+    // data-layout.
+    BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
+
+    if (First) {
+      // We can simply assign the module we just loaded from SPIR-V to the
+      // empty pointer on the first iteration.
+      Result = std::move(NewMod);
+      // The first module will dictate the address bits for the remaining.
+      AddressBits = BinInfo.AddressBits;
+      First = false;
+    } else {
+      // We have already loaded some module, so now we need to
+      // link the module we just loaded with the result so far.
+      // FIXME: We allow duplicates to be overridden by the module
+      // read last. This could cause problems if different modules contain
+      // definitions with the same name, but different body/content.
+      // Check that this is not problematic.
+      Linker::linkModules(*Result, std::move(NewMod),
+                          Linker::Flags::OverrideFromSrc);
+      if (AddressBits != BinInfo.AddressBits) {
+        return createStringError(
+            inconvertibleErrorCode(),
+            "Number of address bits between SPIR-V modules does not match");
+      }
+    }
+  }
+  return std::move(Result);
+}
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
+                             SYCLKernelInfo &Kernel) {
+  auto &BinInfo = Kernel.BinaryInfo;
+  llvm::StringRef RawData(reinterpret_cast<const char *>(BinInfo.BinaryStart),
+                          BinInfo.BinarySize);
+  return llvm::parseBitcodeFile(
+      MemoryBuffer::getMemBuffer(RawData)->getMemBufferRef(), LLVMCtx);
+}
+
+llvm::Expected<std::unique_ptr<llvm::Module>>
+KernelLoader::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
+                              SYCLKernelInfo &Kernel) {
+  return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel);
+}
diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h
new file mode 100644
index 0000000000000..5720abf09bfd7
--- /dev/null
+++ b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h
@@ -0,0 +1,36 @@
+//==-- LoadKernels.h - Load LLVM IR for SYCL kernels in different formats  -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Kernel.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include <llvm/Support/Error.h>
+#include <vector>
+
+namespace jit_compiler {
+namespace translation {
+
+class KernelLoader {
+
+public:
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadKernels(llvm::LLVMContext &LLVMCtx, std::vector<SYCLKernelInfo> &Kernels);
+
+private:
+  ///
+  /// Pair of address and size to represent a binary blob.
+  using BinaryBlob = std::pair<BinaryAddress, size_t>;
+
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadLLVMKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
+
+  static llvm::Expected<std::unique_ptr<llvm::Module>>
+  loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
+};
+} // namespace translation
+} // namespace jit_compiler
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index 77217419e98bd..fc481f23b141d 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -16,7 +16,6 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Linker/Linker.h"
 #include "llvm/Support/raw_ostream.h"
 #include <iostream>
 #include <sstream>
@@ -86,12 +85,19 @@ SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() {
   return Opts;
 }
 
-Expected<std::unique_ptr<Module>>
-SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx,
-                                           BinaryBlob Input) {
-  // Create an input stream for the binary blob.
+Expected<std::unique_ptr<llvm::Module>>
+SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
+                                     SYCLKernelInfo &Kernel) {
+  std::unique_ptr<Module> Result{nullptr};
+
+  SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
+  assert(BinInfo.Format == BinaryFormat::SPIRV &&
+         "Only SPIR-V supported as input");
+
+  // Create an input stream for the SPIR-V binary.
   std::stringstream SPIRStream(
-      std::string(reinterpret_cast<const char *>(Input.first), Input.second),
+      std::string(reinterpret_cast<const char *>(BinInfo.BinaryStart),
+                  BinInfo.BinarySize),
       std::ios_base::in | std::ios_base::binary);
   std::string ErrMsg;
   // Create a raw pointer. readSpirv accepts a reference to a pointer,
@@ -105,77 +111,13 @@ SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx,
         "Failed to load and translate SPIR-V module with error %s",
         ErrMsg.c_str());
   }
-  return std::unique_ptr<Module>(LLVMMod);
-}
+  std::unique_ptr<Module> NewMod{LLVMMod};
 
-Expected<std::unique_ptr<llvm::Module>>
-SPIRVLLVMTranslator::loadSPIRVKernels(llvm::LLVMContext &LLVMCtx,
-                                      std::vector<SYCLKernelInfo> &Kernels) {
-  std::unique_ptr<Module> Result{nullptr};
-  bool First = true;
-  DenseSet<BinaryBlob> ParsedSPIRVModules;
-  size_t AddressBits = 0;
-  for (auto &Kernel : Kernels) {
-    // FIXME: Currently, we use the front of the list.
-    // Do we need to iterate to find the most suitable
-    // SPIR-V module?
-    SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo;
-    // TODO(Lukas, ONNX-399): Also support LLVM IR as input but simply skipping
-    // the translation from SPIR-V to LLVM.
-    assert(BinInfo.Format == BinaryFormat::SPIRV &&
-           "Only SPIR-V supported as input");
-    const unsigned char *SPRModulePtr = BinInfo.BinaryStart;
-    size_t SPRModuleSize = BinInfo.BinarySize;
-    BinaryBlob BinBlob{SPRModulePtr, SPRModuleSize};
-    if (ParsedSPIRVModules.contains(BinBlob)) {
-      // Multiple kernels can be stored in the same SPIR-V module.
-      // If we encountered the same SPIR-V module before, skip.
-      // NOTE: We compare the pointer as well as the size, in case
-      // a previous kernel only referenced part of the SPIR-V module.
-      // Not sure this can actually happen, but better safe than sorry.
-      continue;
-    }
-    // Simply load and translate the SPIR-V into the currently still empty
-    // module.
-    PROPAGATE_ERROR(NewMod, readAndTranslateSPIRV(LLVMCtx, BinBlob));
-
-    // We do not assume that the input binary information has the address bits
-    // set, but rather retrieve this information from the SPIR-V/LLVM module's
-    // data-layout.
-    BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
-    assert((First || BinInfo.AddressBits == AddressBits) &&
-           "Address bits do not match");
-    // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
-    // 'work_group_size_hint' from metadata attached to the kernel function and
-    // store it in the SYCLKernelInfo.
-    // TODO(Lukas, ONNX-399): Validate that DPC++ used metadata to represent
-    // that information.
-    restoreKernelAttributes(NewMod.get(), Kernel);
-
-    if (First) {
-      // We can simply assign the module we just loaded from SPIR-V to the
-      // empty pointer on the first iteration.
-      Result = std::move(NewMod);
-      // The first module will dictate the address bits for the remaining.
-      AddressBits = BinInfo.AddressBits;
-      First = false;
-    } else {
-      // We have already loaded some module, so now we need to
-      // link the module we just loaded with the result so far.
-      // FIXME: We allow duplicates to be overridden by the module
-      // read last. This could cause problems if different modules contain
-      // definitions with the same name, but different body/content.
-      // Check that this is not problematic.
-      Linker::linkModules(*Result, std::move(NewMod),
-                          Linker::Flags::OverrideFromSrc);
-      if (AddressBits != BinInfo.AddressBits) {
-        return createStringError(
-            inconvertibleErrorCode(),
-            "Number of address bits between SPIR-V modules does not match");
-      }
-    }
-  }
-  return std::move(Result);
+  // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
+  // 'work_group_size_hint' from metadata attached to the kernel function and
+  // store it in the SYCLKernelInfo.
+  restoreKernelAttributes(NewMod.get(), Kernel);
+  return std::move(NewMod);
 }
 
 Expected<jit_compiler::SPIRVBinary *>
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
index 5f1d416e45150..d82a9cc82466c 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
@@ -25,8 +25,7 @@ class SPIRVLLVMTranslator {
   ///
   /// Load a list of SPIR-V kernels into a single LLVM module.
   static llvm::Expected<std::unique_ptr<llvm::Module>>
-  loadSPIRVKernels(llvm::LLVMContext &LLVMCtx,
-                   std::vector<SYCLKernelInfo> &Kernels);
+  loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
 
   ///
   /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and
@@ -35,10 +34,6 @@ class SPIRVLLVMTranslator {
                                                             JITContext &JITCtx);
 
 private:
-  ///
-  /// Pair of address and size to represent a binary blob.
-  using BinaryBlob = std::pair<BinaryAddress, size_t>;
-
   ///
   /// Get an attribute value consisting of NumValues scalar constant integers
   /// from the MDNode.
@@ -53,12 +48,6 @@ class SPIRVLLVMTranslator {
   ///   - work_group_size_hint
   static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info);
 
-  ///
-  /// Read the given SPIR-V binary and translate it to a new LLVM module
-  /// associated with the given context.
-  static llvm::Expected<std::unique_ptr<llvm::Module>>
-  readAndTranslateSPIRV(llvm::LLVMContext &LLVMCtx, BinaryBlob Input);
-
   ///
   /// Default settings for the SPIRV translation options.
   static SPIRV::TranslatorOpts &translatorOpts();
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 4e092ef12c4e3..d8a59a58f8a36 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -39,6 +39,70 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
   }
 }
 
+std::pair<const RTDeviceBinaryImage *, RT::PiProgram>
+retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
+  auto KernelName = KernelCG->getKernelName();
+
+  bool isNvidia = Queue->getDeviceImplPtr()->getPlugin().getBackend() ==
+                  backend::ext_oneapi_cuda;
+  if (isNvidia) {
+    auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
+    std::vector<kernel_id> KernelIds;
+    KernelIds.push_back(KernelID);
+    auto DeviceImages =
+        ProgramManager::getInstance().getRawDeviceImages(KernelIds);
+    const RTDeviceBinaryImage *DeviceImage = nullptr;
+    for (auto *DI : DeviceImages) {
+      // We are looking for a device image with LLVM IR format and target spec
+      // "llvm_nvptx64", which has been set by the offload-wrapper action.
+      if (DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
+          DI->getRawData().DeviceTargetSpec == std::string("llvm_nvptx64")) {
+        DeviceImage = DI;
+        break;
+      }
+    }
+    if (!DeviceImage) {
+      return {nullptr, nullptr};
+    }
+    auto ContextImpl = Queue->getContextImplPtr();
+    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+    auto DeviceImpl = Queue->getDeviceImplPtr();
+    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+    RT::PiProgram Program =
+        detail::ProgramManager::getInstance().createPIProgram(*DeviceImage,
+                                                              Context, Device);
+    return {DeviceImage, Program};
+  }
+
+  const RTDeviceBinaryImage *DeviceImage = nullptr;
+  RT::PiProgram Program = nullptr;
+  if (KernelCG->getKernelBundle() != nullptr) {
+    // Retrieve the device image from the kernel bundle.
+    auto KernelBundle = KernelCG->getKernelBundle();
+    kernel_id KernelID =
+        detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
+
+    auto SyclKernel = detail::getSyclObjImpl(
+        KernelBundle->get_kernel(KernelID, KernelBundle));
+
+    DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
+    Program = SyclKernel->getDeviceImage()->get_program_ref();
+  } else if (KernelCG->MSyclKernel != nullptr) {
+    DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
+    Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
+  } else {
+    auto ContextImpl = Queue->getContextImplPtr();
+    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+    auto DeviceImpl = Queue->getDeviceImplPtr();
+    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+    DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
+        KernelCG->MOSModuleHandle, KernelName, Context, Device);
+    Program = detail::ProgramManager::getInstance().createPIProgram(
+        *DeviceImage, Context, Device);
+  }
+  return {DeviceImage, Program};
+}
+
 static ::jit_compiler::ParameterKind
 translateArgType(kernel_param_kind_t Kind) {
   using PK = ::jit_compiler::ParameterKind;
@@ -576,43 +640,20 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
           "Cannot fuse kernel with invalid kernel function name");
       return nullptr;
     }
-    const RTDeviceBinaryImage *DeviceImage = nullptr;
-    RT::PiProgram Program = nullptr;
+
+    auto [DeviceImage, Program] = retrieveKernelBinary(Queue, KernelCG);
+
+    if (!DeviceImage || !Program) {
+      printPerformanceWarning("No suitable IR available for fusion");
+      return nullptr;
+    }
     const KernelArgMask *EliminatedArgs = nullptr;
-    if (KernelCG->getKernelBundle() != nullptr) {
-      // Retrieve the device image from the kernel bundle.
-      auto KernelBundle = KernelCG->getKernelBundle();
-      kernel_id KernelID =
-          detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
-
-      auto SyclKernel = detail::getSyclObjImpl(
-          KernelBundle->get_kernel(KernelID, KernelBundle));
-
-      DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
-      Program = SyclKernel->getDeviceImage()->get_program_ref();
-      EliminatedArgs = SyclKernel->getKernelArgMask();
-    } else if (KernelCG->MSyclKernel != nullptr) {
-      DeviceImage =
-          KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
-      Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
-      EliminatedArgs = KernelCG->MSyclKernel->getKernelArgMask();
-    } else {
-      auto ContextImpl = Queue->getContextImplPtr();
-      auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
-      auto DeviceImpl = Queue->getDeviceImplPtr();
-      auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
-      DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
-          KernelCG->MOSModuleHandle, KernelName, Context, Device);
-      Program = detail::ProgramManager::getInstance().createPIProgram(
-          *DeviceImage, Context, Device);
+    if (Program && (KernelCG->MSyclKernel == nullptr ||
+                    !KernelCG->MSyclKernel->isCreatedFromSource())) {
       EliminatedArgs =
           detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
               KernelCG->MOSModuleHandle, Program, KernelName);
     }
-    if (!DeviceImage || !Program) {
-      printPerformanceWarning("No suitable IR available for fusion");
-      return nullptr;
-    }
 
     // Collect information about the arguments of this kernel.
 
@@ -666,8 +707,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       return nullptr;
     }
     ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
-        translateBinaryImageFormat(DeviceImage->getFormat()), 0,
-        RawDeviceImage.BinaryStart, DeviceImageSize};
+        BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
 
     constexpr auto SYCLTypeToIndices = [](auto Val) -> ::jit_compiler::Indices {
       return {Val.get(0), Val.get(1), Val.get(2)};

From 5d1d778b1bb68593c67b7e17ec2c945ea21651b0 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 11 Jan 2023 14:02:59 +0000
Subject: [PATCH 03/25] [SYCL][Fusion] Add result translation to PTX

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/common/include/Kernel.h           |   2 +-
 sycl-fusion/common/lib/KernelIO.h             |   1 +
 sycl-fusion/jit-compiler/CMakeLists.txt       |   6 +-
 sycl-fusion/jit-compiler/include/JITContext.h |  16 ++-
 sycl-fusion/jit-compiler/include/Options.h    |   6 +-
 sycl-fusion/jit-compiler/lib/JITContext.cpp   |  14 +-
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |  18 +--
 ...{LoadKernels.cpp => KernelTranslation.cpp} | 121 ++++++++++++++++--
 .../{LoadKernels.h => KernelTranslation.h}    |  14 +-
 .../lib/translation/SPIRVLLVMTranslation.cpp  |   4 +-
 .../lib/translation/SPIRVLLVMTranslation.h    |   2 +-
 sycl/source/detail/jit_compiler.cpp           |  17 +++
 12 files changed, 186 insertions(+), 35 deletions(-)
 rename sycl-fusion/jit-compiler/lib/translation/{LoadKernels.cpp => KernelTranslation.cpp} (50%)
 rename sycl-fusion/jit-compiler/lib/translation/{LoadKernels.h => KernelTranslation.h} (64%)

diff --git a/sycl-fusion/common/include/Kernel.h b/sycl-fusion/common/include/Kernel.h
index 1962dd042ffbe..87726b3368d63 100644
--- a/sycl-fusion/common/include/Kernel.h
+++ b/sycl-fusion/common/include/Kernel.h
@@ -34,7 +34,7 @@ enum class ParameterKind : uint32_t {
 };
 
 /// Different binary formats supported as input to the JIT compiler.
-enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV };
+enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX };
 
 /// Information about a device intermediate representation module (e.g., SPIR-V,
 /// LLVM IR) from DPC++.
diff --git a/sycl-fusion/common/lib/KernelIO.h b/sycl-fusion/common/lib/KernelIO.h
index 12c194f8b4dd4..09058d61e9981 100644
--- a/sycl-fusion/common/lib/KernelIO.h
+++ b/sycl-fusion/common/lib/KernelIO.h
@@ -47,6 +47,7 @@ template <> struct ScalarEnumerationTraits<jit_compiler::BinaryFormat> {
   static void enumeration(IO &IO, jit_compiler::BinaryFormat &BF) {
     IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM);
     IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV);
+    IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX);
     IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID);
   }
 };
diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index 92f0cefd68634..f67eed5fc6c76 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -2,7 +2,7 @@
 add_llvm_library(sycl-fusion
    lib/KernelFusion.cpp
    lib/JITContext.cpp
-   lib/translation/LoadKernels.cpp
+   lib/translation/KernelTranslation.cpp
    lib/translation/SPIRVLLVMTranslation.cpp
    lib/fusion/FusionPipeline.cpp
    lib/fusion/FusionHelper.cpp
@@ -20,6 +20,10 @@ add_llvm_library(sycl-fusion
    Linker
    ScalarOpts
    InstCombine
+   Target
+   NVPTX
+   X86
+   MC
 )
 
 target_include_directories(sycl-fusion
diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index 4c0616e267941..d4654f19820ce 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -36,17 +36,21 @@ using CacheKeyT =
                std::optional<std::vector<NDRange>>>;
 
 ///
-/// Wrapper around a SPIR-V binary.
-class SPIRVBinary {
+/// Wrapper around a kernel binary.
+class KernelBinary {
 public:
-  explicit SPIRVBinary(std::string Binary);
+  explicit KernelBinary(std::string Binary, BinaryFormat Format);
 
   jit_compiler::BinaryAddress address() const;
 
   size_t size() const;
 
+  BinaryFormat format() const;
+
 private:
   std::string Blob;
+
+  BinaryFormat Format;
 };
 
 ///
@@ -61,7 +65,8 @@ class JITContext {
 
   llvm::LLVMContext *getLLVMContext();
 
-  SPIRVBinary &emplaceSPIRVBinary(std::string Binary);
+  KernelBinary &emplaceSPIRVBinary(std::string Binary,
+                                   BinaryFormat Format);
 
   std::optional<SYCLKernelInfo> getCacheEntry(CacheKeyT &Identifier) const;
 
@@ -79,11 +84,12 @@ class JITContext {
 
   MutexT BinariesMutex;
 
-  std::vector<SPIRVBinary> Binaries;
+  std::vector<KernelBinary> Binaries;
 
   mutable MutexT CacheMutex;
 
   std::unordered_map<CacheKeyT, SYCLKernelInfo> Cache;
+  
 };
 } // namespace jit_compiler
 
diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h
index 335f58fb64cf7..4fe7787df00db 100644
--- a/sycl-fusion/jit-compiler/include/Options.h
+++ b/sycl-fusion/jit-compiler/include/Options.h
@@ -9,12 +9,13 @@
 #ifndef SYCL_FUSION_JIT_COMPILER_OPTIONS_H
 #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H
 
+#include "Kernel.h"
 #include <memory>
 #include <unordered_map>
 
 namespace jit_compiler {
 
-enum OptionID { VerboseOutput, EnableCaching };
+enum OptionID { VerboseOutput, EnableCaching, TargetFormat };
 
 class OptionPtrBase {};
 
@@ -78,6 +79,9 @@ struct JITEnableVerbose : public OptionBase<OptionID::VerboseOutput, bool> {};
 
 struct JITEnableCaching : public OptionBase<OptionID::EnableCaching, bool> {};
 
+struct JITTargetFormat
+    : public OptionBase<OptionID::TargetFormat, BinaryFormat> {};
+
 } // namespace option
 } // namespace jit_compiler
 
diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp
index 68c7031b9d8a9..e1dda8b928c45 100644
--- a/sycl-fusion/jit-compiler/lib/JITContext.cpp
+++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp
@@ -11,14 +11,17 @@
 
 using namespace jit_compiler;
 
-SPIRVBinary::SPIRVBinary(std::string Binary) : Blob{std::move(Binary)} {}
+KernelBinary::KernelBinary(std::string Binary, BinaryFormat Fmt)
+    : Blob{std::move(Binary)}, Format{Fmt} {}
 
-jit_compiler::BinaryAddress SPIRVBinary::address() const {
+jit_compiler::BinaryAddress KernelBinary::address() const {
   // FIXME: Verify it's a good idea to perform this reinterpret_cast here.
   return reinterpret_cast<jit_compiler::BinaryAddress>(Blob.c_str());
 }
 
-size_t SPIRVBinary::size() const { return Blob.size(); }
+size_t KernelBinary::size() const { return Blob.size(); }
+
+BinaryFormat KernelBinary::format() const { return Format; }
 
 JITContext::JITContext() : LLVMCtx{new llvm::LLVMContext}, Binaries{} {}
 
@@ -26,11 +29,12 @@ JITContext::~JITContext() = default;
 
 llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); }
 
-SPIRVBinary &JITContext::emplaceSPIRVBinary(std::string Binary) {
+KernelBinary &JITContext::emplaceSPIRVBinary(std::string Binary,
+                                             BinaryFormat Format) {
   WriteLockT WriteLock{BinariesMutex};
   // NOTE: With C++17, which returns a reference from emplace_back, the
   // following code would be even simpler.
-  Binaries.emplace_back(std::move(Binary));
+  Binaries.emplace_back(std::move(Binary), Format);
   return Binaries.back();
 }
 
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index fc168587738b5..eeffb0af6ced8 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -15,7 +15,7 @@
 #include "fusion/FusionPipeline.h"
 #include "helper/ConfigHelper.h"
 #include "helper/ErrorHandling.h"
-#include "translation/LoadKernels.h"
+#include "translation/KernelTranslation.h"
 #include "translation/SPIRVLLVMTranslation.h"
 #include <llvm/Support/Error.h>
 #include <sstream>
@@ -98,8 +98,8 @@ FusionResult KernelFusion::fuseKernels(
   // Load all input kernels from their respective SPIR-V modules into a single
   // LLVM IR module.
   llvm::Expected<std::unique_ptr<llvm::Module>> ModOrError =
-      translation::KernelLoader::loadKernels(*JITCtx.getLLVMContext(),
-                                             ModuleInfo.kernels());
+      translation::KernelTranslator::loadKernels(*JITCtx.getLLVMContext(),
+                                                 ModuleInfo.kernels());
   if (auto Error = ModOrError.takeError()) {
     return errorToFusionResult(std::move(Error), "SPIR-V translation failed");
   }
@@ -137,14 +137,14 @@ FusionResult KernelFusion::fuseKernels(
 
   SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName);
 
-  // Translate the LLVM IR module resulting from the fusion pass into SPIR-V.
-  llvm::Expected<jit_compiler::SPIRVBinary *> BinaryOrError =
-      translation::SPIRVLLVMTranslator::translateLLVMtoSPIRV(*NewMod, JITCtx);
-  if (auto Error = BinaryOrError.takeError()) {
+  // TODO
+  BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
+
+  if (auto Error = translation::KernelTranslator::translateKernel(
+          FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) {
     return errorToFusionResult(std::move(Error),
-                               "Translation to SPIR-V failed");
+                               "Translation to output format failed");
   }
-  jit_compiler::SPIRVBinary *SPIRVBin = *BinaryOrError;
 
   FusedKernelInfo.NDR = FusedKernel.FusedNDRange;
 
diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
similarity index 50%
rename from sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
rename to sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index cabf1f6c44e9f..31fa850d58d63 100644
--- a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -1,4 +1,4 @@
-//==-------------------------- LoadKernels.cpp  ----------------------------==//
+//==----------------------- KernelTranslation.cpp  -------------------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,19 +6,24 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "LoadKernels.h"
+#include "KernelTranslation.h"
 #include "SPIRVLLVMTranslation.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Linker/Linker.h"
+#include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
 
 using namespace jit_compiler;
 using namespace jit_compiler::translation;
 using namespace llvm;
 
 llvm::Expected<std::unique_ptr<llvm::Module>>
-KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx,
-                          std::vector<SYCLKernelInfo> &Kernels) {
+KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
+                              std::vector<SYCLKernelInfo> &Kernels) {
   std::unique_ptr<Module> Result{nullptr};
   bool First = true;
   DenseSet<BinaryBlob> ParsedBinaries;
@@ -100,8 +105,8 @@ KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx,
 }
 
 llvm::Expected<std::unique_ptr<llvm::Module>>
-KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
-                             SYCLKernelInfo &Kernel) {
+KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
+                                 SYCLKernelInfo &Kernel) {
   auto &BinInfo = Kernel.BinaryInfo;
   llvm::StringRef RawData(reinterpret_cast<const char *>(BinInfo.BinaryStart),
                           BinInfo.BinarySize);
@@ -110,7 +115,107 @@ KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
 }
 
 llvm::Expected<std::unique_ptr<llvm::Module>>
-KernelLoader::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
-                              SYCLKernelInfo &Kernel) {
+KernelTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
+                                  SYCLKernelInfo &Kernel) {
   return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel);
 }
+
+llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
+                                              llvm::Module &Mod,
+                                              JITContext &JITCtx,
+                                              BinaryFormat Format) {
+
+  KernelBinary *KernelBin = nullptr;
+  switch (Format) {
+  case BinaryFormat::SPIRV: {
+    llvm::Expected<KernelBinary *> BinaryOrError =
+        translateToSPIRV(Mod, JITCtx);
+    if (auto Error = BinaryOrError.takeError()) {
+      return Error;
+    }
+    KernelBin = *BinaryOrError;
+    break;
+  }
+  case BinaryFormat::PTX: {
+    llvm::Expected<KernelBinary *> BinaryOrError = translateToPTX(Mod, JITCtx);
+    if (auto Error = BinaryOrError.takeError()) {
+      return Error;
+    }
+    KernelBin = *BinaryOrError;
+    break;
+  }
+  default: {
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Failed to translate kernel to unsupported output format");
+  }
+  }
+
+  // Update the KernelInfo for the fused kernel with the address and size of the
+  // SPIR-V binary resulting from translation.
+  SYCLKernelBinaryInfo &FusedBinaryInfo = Kernel.BinaryInfo;
+  FusedBinaryInfo.Format = Format;
+  // Output SPIR-V should use the same number of address bits as the input
+  // SPIR-V. SPIR-V translation requires all modules to use the same number of
+  // address bits, so it's safe to take the value from the first one.
+  FusedBinaryInfo.AddressBits = Mod.getDataLayout().getPointerSizeInBits();
+  FusedBinaryInfo.BinaryStart = KernelBin->address();
+  FusedBinaryInfo.BinarySize = KernelBin->size();
+  return Error::success();
+}
+
+llvm::Expected<KernelBinary *>
+KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) {
+  return SPIRVLLVMTranslator::translateLLVMtoSPIRV(Mod, JITCtx);
+}
+
+llvm::Expected<KernelBinary *>
+KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
+  // FIXME: Can we limit this to the NVPTX specific target?
+  llvm::InitializeAllTargets();
+  llvm::InitializeAllAsmParsers();
+  llvm::InitializeAllAsmPrinters();
+  llvm::InitializeAllTargetMCs();
+
+  std::string TargetTriple{"nvptx64-nvidia-cuda"};
+
+  std::string ErrorMessage;
+  const auto *Target =
+      llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage);
+
+  if (!Target) {
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Failed to load and translate SPIR-V module with error %s",
+        ErrorMessage.c_str());
+  }
+
+  // FIXME: Check whether we can provide more accurate target information here
+  auto *TargetMachine = Target->createTargetMachine(
+      TargetTriple, "sm_50", "+sm_50,+ptx76", {}, llvm::Reloc::PIC_,
+      std::nullopt, llvm::CodeGenOpt::Default);
+
+  llvm::legacy::PassManager PM;
+
+  std::string PTXASM;
+
+  {
+    llvm::raw_string_ostream ASMStream{PTXASM};
+    llvm::buffer_ostream BufferedASM{ASMStream};
+
+    if (TargetMachine->addPassesToEmitFile(PM, BufferedASM, nullptr,
+                                           llvm::CGFT_AssemblyFile)) {
+      return createStringError(
+          inconvertibleErrorCode(),
+          "Failed to construct pass pipeline to emit output");
+    }
+
+    PM.run(Mod);
+    ASMStream.flush();
+  }
+
+  llvm::dbgs() << "PTX size: " << PTXASM.size() << "\n";
+  llvm::dbgs() << "PTX:\n" << PTXASM << "\n";
+
+  return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX);
+}
diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
similarity index 64%
rename from sycl-fusion/jit-compiler/lib/translation/LoadKernels.h
rename to sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
index 5720abf09bfd7..3d6824edbdd37 100644
--- a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
@@ -1,4 +1,4 @@
-//==-- LoadKernels.h - Load LLVM IR for SYCL kernels in different formats  -==//
+//==- KernelTranslation - Translate SYCL kernels between different formats -==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "JITContext.h"
 #include "Kernel.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
@@ -15,12 +16,15 @@
 namespace jit_compiler {
 namespace translation {
 
-class KernelLoader {
+class KernelTranslator {
 
 public:
   static llvm::Expected<std::unique_ptr<llvm::Module>>
   loadKernels(llvm::LLVMContext &LLVMCtx, std::vector<SYCLKernelInfo> &Kernels);
 
+  static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod,
+                              JITContext &JITCtx, BinaryFormat Format);
+
 private:
   ///
   /// Pair of address and size to represent a binary blob.
@@ -31,6 +35,12 @@ class KernelLoader {
 
   static llvm::Expected<std::unique_ptr<llvm::Module>>
   loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel);
+
+  static llvm::Expected<KernelBinary *> translateToSPIRV(llvm::Module &Mod,
+                                                         JITContext &JITCtx);
+
+  static llvm::Expected<KernelBinary *> translateToPTX(llvm::Module &Mod,
+                                                       JITContext &JITCtx);
 };
 } // namespace translation
 } // namespace jit_compiler
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index fc481f23b141d..1046fd6dc2907 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -120,7 +120,7 @@ SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
   return std::move(NewMod);
 }
 
-Expected<jit_compiler::SPIRVBinary *>
+Expected<jit_compiler::KernelBinary *>
 SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) {
   std::ostringstream BinaryStream;
   std::string ErrMsg;
@@ -131,5 +131,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) {
         "Translation of LLVM IR to SPIR-V failed with error %s",
         ErrMsg.c_str());
   }
-  return &JITCtx.emplaceSPIRVBinary(BinaryStream.str());
+  return &JITCtx.emplaceSPIRVBinary(BinaryStream.str(), BinaryFormat::SPIRV);
 }
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
index d82a9cc82466c..440c00103b0d5 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
@@ -30,7 +30,7 @@ class SPIRVLLVMTranslator {
   ///
   /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and
   /// return a pointer to its container.
-  static llvm::Expected<SPIRVBinary *> translateLLVMtoSPIRV(llvm::Module &Mod,
+  static llvm::Expected<KernelBinary *> translateLLVMtoSPIRV(llvm::Module &Mod,
                                                             JITContext &JITCtx);
 
 private:
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index d8a59a58f8a36..f34f0ac682f7b 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -39,6 +39,21 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
   }
 }
 
+::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) {
+  auto Backend = Queue->getDeviceImplPtr()->getPlugin().getBackend();
+  switch (Backend) {
+  case backend::ext_oneapi_level_zero:
+  case backend::opencl:
+    return ::jit_compiler::BinaryFormat::SPIRV;
+  case backend::ext_oneapi_cuda:
+    return ::jit_compiler::BinaryFormat::PTX;
+  default:
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::feature_not_supported),
+        "Backend unsupported by kernel fusion");
+  }
+}
+
 std::pair<const RTDeviceBinaryImage *, RT::PiProgram>
 retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
   auto KernelName = KernelCG->getKernelName();
@@ -796,6 +811,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
   JITConfig.set<::jit_compiler::option::JITEnableCaching>(
       detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get());
+  JITConfig.set<::jit_compiler::option::JITTargetFormat>(
+      getTargetFormat(Queue));
 
   auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
       *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,

From d32c7586cb014d2191440f2a3d050b6cc9205e8b Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Thu, 12 Jan 2023 08:56:43 +0000
Subject: [PATCH 04/25] [SYCL][Fusion] Provide correct target spec;

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 12 ------
 sycl/source/detail/jit_compiler.cpp           | 38 ++++++++++++++-----
 sycl/source/detail/jit_compiler.hpp           |  3 +-
 sycl/source/detail/jit_device_binaries.cpp    | 10 ++---
 sycl/source/detail/jit_device_binaries.hpp    |  4 +-
 5 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index eeffb0af6ced8..6c14ac182514a 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -148,18 +148,6 @@ FusionResult KernelFusion::fuseKernels(
 
   FusedKernelInfo.NDR = FusedKernel.FusedNDRange;
 
-  // Update the KernelInfo for the fused kernel with the address and size of the
-  // SPIR-V binary resulting from translation.
-  SYCLKernelBinaryInfo &FusedBinaryInfo = FusedKernelInfo.BinaryInfo;
-  FusedBinaryInfo.Format = BinaryFormat::SPIRV;
-  // Output SPIR-V should use the same number of address bits as the input
-  // SPIR-V. SPIR-V translation requires all modules to use the same number of
-  // address bits, so it's safe to take the value from the first one.
-  FusedBinaryInfo.AddressBits =
-      ModuleInfo.kernels().front().BinaryInfo.AddressBits;
-  FusedBinaryInfo.BinaryStart = SPIRVBin->address();
-  FusedBinaryInfo.BinarySize = SPIRVBin->size();
-
   if (CachingEnabled) {
     JITCtx.addCacheEntry(CacheKey, FusedKernelInfo);
   }
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index f34f0ac682f7b..40e8f946135e3 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -811,8 +811,9 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
   JITConfig.set<::jit_compiler::option::JITEnableCaching>(
       detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get());
-  JITConfig.set<::jit_compiler::option::JITTargetFormat>(
-      getTargetFormat(Queue));
+      
+  ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue);
+  JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat);
 
   auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
       *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,
@@ -854,7 +855,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
 
   if (!FusionResult.cached()) {
-    auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo);
+    auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
     detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
   } else if (DebugEnabled) {
     std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
@@ -866,9 +867,11 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       FusedKernelInfo.Name);
   std::vector<std::shared_ptr<const void>> RawExtendedMembers;
 
-  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
-      detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
-          Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
+  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr;
+  if (TargetFormat == ::jit_compiler::BinaryFormat::SPIRV) {
+    detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
+        Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
+  }
 
   std::unique_ptr<detail::CG> FusedCG;
   FusedCG.reset(new detail::CGExecKernel(
@@ -881,7 +884,25 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
 }
 
 pi_device_binaries jit_compiler::createPIDeviceBinary(
-    const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) {
+    const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
+    ::jit_compiler::BinaryFormat Format) {
+
+  const char *TargetSpec = nullptr;
+  switch (Format) {
+  case ::jit_compiler::BinaryFormat::PTX: {
+    TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64;
+    break;
+  }
+  case ::jit_compiler::BinaryFormat::SPIRV: {
+    TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64)
+                     ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64
+                     : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32;
+    break;
+  }
+  default:
+    sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                    "Invalid output format");
+  }
 
   DeviceBinaryContainer Binary;
 
@@ -909,8 +930,7 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
   DeviceBinariesCollection Collection;
   Collection.addDeviceBinary(std::move(Binary),
                              FusedKernelInfo.BinaryInfo.BinaryStart,
-                             FusedKernelInfo.BinaryInfo.BinarySize,
-                             FusedKernelInfo.BinaryInfo.AddressBits);
+                             FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec);
 
   JITDeviceBinaries.push_back(std::move(Collection));
   return JITDeviceBinaries.back().getPIDeviceStruct();
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 522c0749ef75b..e02be562de3ee 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -13,6 +13,7 @@
 #include <detail/scheduler/scheduler.hpp>
 
 namespace jit_compiler {
+enum class BinaryFormat;
 class JITContext;
 struct SYCLKernelInfo;
 using ArgUsageMask = std::vector<unsigned char>;
@@ -46,7 +47,7 @@ class jit_compiler {
   jit_compiler &operator=(const jit_compiler &&) = delete;
 
   pi_device_binaries
-  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo);
+  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, ::jit_compiler::BinaryFormat Format);
 
   std::vector<uint8_t>
   encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const;
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
index 0aa778da14240..eadd71d021ee8 100644
--- a/sycl/source/detail/jit_device_binaries.cpp
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -81,7 +81,7 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
 }
 
 pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
-    const unsigned char *BinaryStart, size_t BinarySize, size_t AddressBits) {
+    const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec) {
   pi_device_binary_struct DeviceBinary;
   DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
   DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
@@ -94,9 +94,7 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
   DeviceBinary.BinaryStart = BinaryStart;
   DeviceBinary.BinaryEnd = BinaryStart + BinarySize;
   DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV;
-  DeviceBinary.DeviceTargetSpec = (AddressBits == 32)
-                                      ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32
-                                      : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64;
+  DeviceBinary.DeviceTargetSpec = TargetSpec;
   DeviceBinary.EntriesBegin = PIOffloadEntries.data();
   DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size();
   DeviceBinary.PropertySetsBegin = PIPropertySets.data();
@@ -108,14 +106,14 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
 void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
                                                const unsigned char *BinaryStart,
                                                size_t BinarySize,
-                                               size_t AddressBits) {
+                                               const char* TargetSpec) {
   // Adding to the vectors might trigger reallocation, which would invalidate
   // the pointers used for PI structs if a PI struct has already been created
   // via getPIDeviceStruct(). Forbid calls to this method after the first PI
   // struct has been created.
   assert(Fused && "Adding to container would invalidate existing PI structs");
   PIBinaries.push_back(
-      Cont.getPIDeviceBinary(BinaryStart, BinarySize, AddressBits));
+      Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec));
   Binaries.push_back(std::move(Cont));
 }
 
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
index 7bf2c7d9fe07b..6fa142543b61f 100644
--- a/sycl/source/detail/jit_device_binaries.hpp
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -111,7 +111,7 @@ class DeviceBinaryContainer {
 
   pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
                                             size_t BinarySize,
-                                            size_t AddressBits);
+                                            const char* TargetSpec);
 
 private:
   bool Fused = true;
@@ -138,7 +138,7 @@ class DeviceBinariesCollection {
 
   void addDeviceBinary(DeviceBinaryContainer &&Cont,
                        const unsigned char *BinaryStart, size_t BinarySize,
-                       size_t AddressBits);
+                       const char* TargetSpec);
   pi_device_binaries getPIDeviceStruct();
 
 private:

From b99478118db2a3fb3f00f0d24358fb7d7843eb11 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Fri, 13 Jan 2023 13:16:28 +0000
Subject: [PATCH 05/25] [SYCL][Fusion] Avoid removing dependencies

Avoid the dependencies of the dependencies to be removed when cleaning
up the input commands from the graph without executing them.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/scheduler/commands.hpp      | 8 ++++++++
 sycl/source/detail/scheduler/graph_builder.cpp | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 35df6ef614cc7..a019ab9378226 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -289,6 +289,14 @@ class Command {
   // XPTI instrumentation. Copy code location details to the internal struct.
   // Memory is allocated in this method and released in destructor.
   void copySubmissionCodeLocation();
+  
+  /// Clear all dependency events for device and host dependencies. This should
+  /// only be used if a command is about to be deleted without being executed
+  /// before that.
+  void clearAllDependencies() {
+    MPreparedDepsEvents.clear();
+    MPreparedHostDepsEvents.clear();
+  }
 
   /// Contains list of dependencies(edges)
   std::vector<DepDesc> MDeps;
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 5dee68c6e69d2..a404fe707698b 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1396,6 +1396,10 @@ void Scheduler::GraphBuilder::removeNodeFromGraph(
   }
 
   Node->MDeps.clear();
+  // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called
+  // from the destructor of the command to delete the dependencies of the
+  // command this command depends on.
+  Node->clearAllDependencies();
 }
 
 void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue,

From 651847eb18cc96cdc7f859c7d6bec282940be248 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Fri, 13 Jan 2023 13:23:14 +0000
Subject: [PATCH 06/25] [SYCL][Fusion] Set device binary image format

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp        | 18 ++++++++++++------
 sycl/source/detail/jit_device_binaries.cpp |  8 ++++----
 sycl/source/detail/jit_device_binaries.hpp |  4 ++--
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 40e8f946135e3..04a09ee84a72e 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -811,7 +811,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
   JITConfig.set<::jit_compiler::option::JITEnableCaching>(
       detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get());
-      
+
   ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue);
   JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat);
 
@@ -854,10 +854,13 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   }(FusedKernelInfo.NDR);
   updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
 
+  OSModuleHandle Handle = OSUtil::DummyModuleHandle;
   if (!FusionResult.cached()) {
     auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
     detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
+    Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries);
   } else if (DebugEnabled) {
+    // TODO(Lukas): Create correct OSModuleHandle when using a cached binary.
     std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
   }
 
@@ -878,8 +881,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
       std::move(ArgsStorage), std::move(AccStorage),
       std::move(RawExtendedMembers), std::move(Requirements), std::move(Events),
-      std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {},
-      {}, CG::CGTYPE::Kernel, KernelCacheConfig));
+      std::move(FusedArgs), FusedKernelInfo.Name, Handle, {}, {},
+      CG::CGTYPE::Kernel, KernelCacheConfig));
   return FusedCG;
 }
 
@@ -888,15 +891,18 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
     ::jit_compiler::BinaryFormat Format) {
 
   const char *TargetSpec = nullptr;
+  pi_device_binary_type BinFormat = PI_DEVICE_BINARY_TYPE_NATIVE;
   switch (Format) {
   case ::jit_compiler::BinaryFormat::PTX: {
     TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64;
+    BinFormat = PI_DEVICE_BINARY_TYPE_NONE;
     break;
   }
   case ::jit_compiler::BinaryFormat::SPIRV: {
     TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64)
                      ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64
                      : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32;
+    BinFormat = PI_DEVICE_BINARY_TYPE_SPIRV;
     break;
   }
   default:
@@ -928,9 +934,9 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
   Binary.addProperty(std::move(ArgMaskPropSet));
 
   DeviceBinariesCollection Collection;
-  Collection.addDeviceBinary(std::move(Binary),
-                             FusedKernelInfo.BinaryInfo.BinaryStart,
-                             FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec);
+  Collection.addDeviceBinary(
+      std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart,
+      FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat);
 
   JITDeviceBinaries.push_back(std::move(Collection));
   return JITDeviceBinaries.back().getPIDeviceStruct();
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
index eadd71d021ee8..59530a0e691ce 100644
--- a/sycl/source/detail/jit_device_binaries.cpp
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -81,10 +81,11 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
 }
 
 pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
-    const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec) {
+    const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec, pi_device_binary_type Format) {
   pi_device_binary_struct DeviceBinary;
   DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
   DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
+  DeviceBinary.Format = Format;
   DeviceBinary.CompileOptions = "";
   DeviceBinary.LinkOptions = "";
   DeviceBinary.ManifestStart = nullptr;
@@ -93,7 +94,6 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
   // the JITContext.
   DeviceBinary.BinaryStart = BinaryStart;
   DeviceBinary.BinaryEnd = BinaryStart + BinarySize;
-  DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV;
   DeviceBinary.DeviceTargetSpec = TargetSpec;
   DeviceBinary.EntriesBegin = PIOffloadEntries.data();
   DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size();
@@ -106,14 +106,14 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
 void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
                                                const unsigned char *BinaryStart,
                                                size_t BinarySize,
-                                               const char* TargetSpec) {
+                                               const char* TargetSpec, pi_device_binary_type Format) {
   // Adding to the vectors might trigger reallocation, which would invalidate
   // the pointers used for PI structs if a PI struct has already been created
   // via getPIDeviceStruct(). Forbid calls to this method after the first PI
   // struct has been created.
   assert(Fused && "Adding to container would invalidate existing PI structs");
   PIBinaries.push_back(
-      Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec));
+      Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec, Format));
   Binaries.push_back(std::move(Cont));
 }
 
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
index 6fa142543b61f..96079d3a25a19 100644
--- a/sycl/source/detail/jit_device_binaries.hpp
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -111,7 +111,7 @@ class DeviceBinaryContainer {
 
   pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
                                             size_t BinarySize,
-                                            const char* TargetSpec);
+                                            const char* TargetSpec, pi_device_binary_type Format);
 
 private:
   bool Fused = true;
@@ -138,7 +138,7 @@ class DeviceBinariesCollection {
 
   void addDeviceBinary(DeviceBinaryContainer &&Cont,
                        const unsigned char *BinaryStart, size_t BinarySize,
-                       const char* TargetSpec);
+                       const char* TargetSpec, pi_device_binary_type Format);
   pi_device_binaries getPIDeviceStruct();
 
 private:

From 7fddbd5908acfd36116c8ce412f402947fe4b42f Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Mon, 16 Jan 2023 14:28:36 +0000
Subject: [PATCH 07/25] [SYCL][Fusion] Refactor target-specific processing

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/passes/CMakeLists.txt             |   1 +
 .../passes/kernel-fusion/SYCLKernelFusion.cpp | 133 +++--------
 .../passes/kernel-fusion/SYCLKernelFusion.h   |   9 +-
 .../passes/target/TargetFusionInfo.cpp        | 225 ++++++++++++++++++
 sycl-fusion/passes/target/TargetFusionInfo.h  | 137 +++++++++++
 5 files changed, 395 insertions(+), 110 deletions(-)
 create mode 100644 sycl-fusion/passes/target/TargetFusionInfo.cpp
 create mode 100644 sycl-fusion/passes/target/TargetFusionInfo.h

diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index fe81b76b5bbcb..41398f6ee9e99 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_library(SYCLKernelFusionPasses
   syclcp/SYCLCP.cpp
   cleanup/Cleanup.cpp
   debug/PassDebug.cpp
+  target/TargetFusionInfo.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
index 241496ac9044f..d3dec7f16a81f 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
@@ -31,12 +31,6 @@
 
 using namespace llvm;
 
-constexpr static StringLiteral KernelArgAddrSpace{"kernel_arg_addr_space"};
-constexpr static StringLiteral KernelArgAccessQual{"kernel_arg_access_qual"};
-constexpr static StringLiteral KernelArgType{"kernel_arg_type"};
-constexpr static StringLiteral KernelArgBaseType{"kernel_arg_base_type"};
-constexpr static StringLiteral KernelArgTypeQual{"kernel_arg_type_qual"};
-
 constexpr StringLiteral SYCLKernelFusion::NDRangeMDKey;
 constexpr StringLiteral SYCLKernelFusion::NDRangesMDKey;
 
@@ -144,6 +138,8 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       AM.getResult<SYCLModuleInfoAnalysis>(M).ModuleInfo;
   assert(ModuleInfo && "No module information available");
 
+  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+
   // Iterate over the functions in the module and locate all
   // stub functions identified by metadata.
   SmallPtrSet<Function *, 8> ToCleanUp;
@@ -156,7 +152,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       // attached to this stub function.
       // The newly created function will carry the name also specified
       // in the metadata.
-      if (auto Err = fuseKernel(M, F, ModuleInfo, ToCleanUp)) {
+      if (auto Err = fuseKernel(M, F, ModuleInfo, TFI, ToCleanUp)) {
         DeferredErrs = joinErrors(std::move(DeferredErrs), std::move(Err));
       }
       // Rembember the stub for deletion, as it is not required anymore after
@@ -164,6 +160,10 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       ToCleanUp.insert(&F);
     }
   }
+  // Notify the target-specific logic that some functions will be erased
+  // shortly.
+  SmallVector<Function *> NotifyDelete{ToCleanUp.begin(), ToCleanUp.end()};
+  TFI.notifyFunctionsDelete(NotifyDelete);
   // Delete all the stub functions
   for (Function *SF : ToCleanUp) {
     SF->eraseFromParent();
@@ -230,11 +230,13 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder,
   return {Entry, CallInsertion, Exit};
 }
 
-static Expected<CallInst *> createFusionCall(
-    IRBuilderBase &Builder, Function *F, ArrayRef<Value *> CallArgs,
-    const jit_compiler::NDRange &SrcNDRange,
-    const jit_compiler::NDRange &FusedNDRange, bool IsLast, int BarriersFlags,
-    jit_compiler::Remapper &Remapper, bool ShouldRemap) {
+static Expected<CallInst *>
+createFusionCall(IRBuilderBase &Builder, Function *F,
+                 ArrayRef<Value *> CallArgs,
+                 const jit_compiler::NDRange &SrcNDRange,
+                 const jit_compiler::NDRange &FusedNDRange, bool IsLast,
+                 int BarriersFlags, jit_compiler::Remapper &Remapper,
+                 bool ShouldRemap, TargetFusionInfo &TargetInfo) {
   const auto IPs = addGuard(Builder, SrcNDRange, FusedNDRange, IsLast);
 
   if (ShouldRemap) {
@@ -260,7 +262,7 @@ static Expected<CallInst *> createFusionCall(
 
   // Insert barrier if needed
   if (!IsLast && BarriersFlags > 0) {
-    jit_compiler::barrierCall(Builder, BarriersFlags);
+    TargetInfo.createBarrierCall(Builder, BarriersFlags);
   }
 
   // Set insert point for future insertions
@@ -271,6 +273,7 @@ static Expected<CallInst *> createFusionCall(
 
 Error SYCLKernelFusion::fuseKernel(
     Module &M, Function &StubFunction, jit_compiler::SYCLModuleInfo *ModInfo,
+    TargetFusionInfo &TargetInfo,
     SmallPtrSetImpl<Function *> &ToCleanUp) const {
   // Retrieve the metadata from the stub function.
   // The first operand of the tuple is the name that the newly created,
@@ -343,12 +346,9 @@ Error SYCLKernelFusion::fuseKernel(
   SmallVector<std::string> FusedArgNames;
   SmallVector<AttributeSet> FusedParamAttributes;
   // We must keep track of some metadata attached to each parameter.
-  // Collect it in lists, so it can be attached to the fused function later on.
-  MDList KernelArgAddressSpaces;
-  MDList KernelArgAccessQualifiers;
-  MDList KernelArgTypes;
-  MDList KernelArgBaseTypes;
-  MDList KernelArgTypeQualifiers;
+  // Collect it, so it can be attached to the fused function later on.
+  MetadataCollection MDCollection{TargetInfo.getKernelMetadataKeys()};
+
   // Add the information about the new kernel to the SYCLModuleInfo.
   // Initialize the jit_compiler::SYCLKernelInfo with the name. The remaining
   // information for functor & argument layout and attributes will be filled in
@@ -425,14 +425,7 @@ Error SYCLKernelFusion::fuseKernel(
     // Add the metadata corresponding to the used arguments to the different
     // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because
     // the kernel arguments receive new names in the fused kernel.
-    addToFusedMetadata(FF, KernelArgAddrSpace, UsedArgsMask,
-                       KernelArgAddressSpaces);
-    addToFusedMetadata(FF, KernelArgAccessQual, UsedArgsMask,
-                       KernelArgAccessQualifiers);
-    addToFusedMetadata(FF, KernelArgType, UsedArgsMask, KernelArgTypes);
-    addToFusedMetadata(FF, KernelArgBaseType, UsedArgsMask, KernelArgBaseTypes);
-    addToFusedMetadata(FF, KernelArgTypeQual, UsedArgsMask,
-                       KernelArgTypeQualifiers);
+    MDCollection.collectFromFunction(FF, UsedArgsMask);
 
     // Update the fused kernel's KernelInfo with information from this input
     // kernel.
@@ -502,26 +495,15 @@ Error SYCLKernelFusion::fuseKernel(
     AI.value().setName(ArgName);
     KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName));
   }
-  // Attach the fused kernel_arg_* metadata collected from the different input
+  // Attach the fused metadata collected from the different input
   // kernels to the fused function.
-  attachFusedMetadata(FusedFunction, "kernel_arg_addr_space",
-                      KernelArgAddressSpaces);
-  attachFusedMetadata(FusedFunction, "kernel_arg_access_qual",
-                      KernelArgAccessQualifiers);
-  attachFusedMetadata(FusedFunction, "kernel_arg_type", KernelArgTypes);
-  attachFusedMetadata(FusedFunction, "kernel_arg_base_type",
-                      KernelArgBaseTypes);
-  attachFusedMetadata(FusedFunction, "kernel_arg_type_qual",
-                      KernelArgTypeQualifiers);
-  attachFusedMetadata(FusedFunction, "kernel_arg_name", KernelArgNames);
+  MDCollection.attachToFunction(FusedFunction);
   // Add metadata for reqd_work_group_size and work_group_size_hint
   attachKernelAttributeMD(LLVMCtx, FusedFunction, FusedKernelInfo);
 
-  // The fused kernel should be a SPIR-V kernel again.
-  // NOTE: If this pass is used in a scenario where input and output
-  // of the compilation are not SPIR-V, care must be taken of other
-  // potential calling conventions here (e.g., nvptx).
-  FusedFunction->setCallingConv(CallingConv::SPIR_KERNEL);
+  // Mark the fused function as a kernel by calling TargetFusionInfo, because
+  // this is target-specific.
+  TargetInfo.addKernelFunction(FusedFunction);
 
   // Fusion is implemented as a two step process: In the first step, we
   // simply create calls to the functions that should be fused into this
@@ -557,9 +539,9 @@ Error SYCLKernelFusion::fuseKernel(
         unsigned ParamIdx = ParamMapping[{FuncIndex, I}];
         CallArgs.push_back(FusedFunction->getArg(ParamIdx));
       }
-      auto CallOrErr = createFusionCall(Builder, IF, CallArgs, KF.ND, NDRange,
-                                        FuncIndex == BarriersEnd, BarriersFlags,
-                                        Remapper, IsHeterogeneousNDRangesList);
+      auto CallOrErr = createFusionCall(
+          Builder, IF, CallArgs, KF.ND, NDRange, FuncIndex == BarriersEnd,
+          BarriersFlags, Remapper, IsHeterogeneousNDRangesList, TargetInfo);
       // Add to the set of original kernel functions that can be deleted after
       // fusion is complete.
       ToCleanUp.insert(IF);
@@ -602,46 +584,8 @@ Error SYCLKernelFusion::fuseKernel(
     }
   }
 
-  // Remove all existing calls of the ITT instrumentation functions. Insert new
-  // ones in the entry block of the fused kernel and every exit block if the
-  // functions are present in the module.
-  // We cannot use the existing SPIRITTAnnotations pass, because that pass might
-  // insert calls to functions not present in the module (e.g., ITT
-  // instrumentations for barriers). As the JITed module is not linked with
-  // libdevice anymore, the functions would remain unresolved and cause the
-  // driver to fail.
-  Function *StartWrapperFunc = M.getFunction(ITTStartWrapper);
-  Function *FinishWrapperFunc = M.getFunction(ITTFinishWrapper);
-  bool InsertWrappers =
-      ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
-       (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
-  auto *WrapperFuncTy =
-      FunctionType::get(Type::getVoidTy(M.getContext()), /*isVarArg*/ false);
-  for (auto &BB : *FusedFunction) {
-    for (auto Inst = BB.begin(); Inst != BB.end();) {
-      if (auto *CB = dyn_cast<CallBase>(Inst)) {
-        if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
-          Inst = Inst->eraseFromParent();
-          continue;
-        }
-      }
-      ++Inst;
-    }
-    if (InsertWrappers) {
-      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-        auto *WrapperCall =
-            CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
-        WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-      }
-    }
-  }
-  if (InsertWrappers) {
-    FusedFunction->getEntryBlock().getFirstInsertionPt();
-    auto *WrapperCall = CallInst::Create(
-        WrapperFuncTy, StartWrapperFunc, "",
-        &*FusedFunction->getEntryBlock().getFirstInsertionPt());
-    WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-  }
+  // Perform target-specific post-processing of the new fused kernel.
+  TargetInfo.postProcessKernel(FusedFunction);
   return Error::success();
 }
 
@@ -717,23 +661,6 @@ static unsigned getUnsignedFromMD(Metadata *MD) {
   return ConstInt->getZExtValue();
 }
 
-void SYCLKernelFusion::addToFusedMetadata(
-    Function *InputFunction, const StringRef &Kind,
-    const ArrayRef<bool> IsArgPresentMask,
-    SmallVectorImpl<Metadata *> &FusedMDList) const {
-  // Retrieve metadata from one of the input kernels and add it to the list
-  // of fused metadata.
-  assert(InputFunction->hasMetadata(Kind) &&
-         "Required Metadata not present on input kernel");
-  if (auto *MD = InputFunction->getMetadata(Kind)) {
-    for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) {
-      if (std::get<0>(MaskedOps)) {
-        FusedMDList.emplace_back(std::get<1>(MaskedOps).get());
-      }
-    }
-  }
-}
-
 void SYCLKernelFusion::attachFusedMetadata(
     Function *FusedFunction, const StringRef &Kind,
     const ArrayRef<Metadata *> FusedMetadata) const {
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
index 5e26595d2d343..5f52f0a317d14 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
@@ -10,6 +10,7 @@
 #define SYCL_FUSION_PASSES_SYCLKERNELFUSION_H
 
 #include "Kernel.h"
+#include "target/TargetFusionInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -52,8 +53,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
   // locate our own metadata again.
   static constexpr auto MetadataKind = "sycl.kernel.fused";
   static constexpr auto ParameterMDKind = "sycl.kernel.param";
-  static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
-  static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
 
   using MDList = llvm::SmallVector<llvm::Metadata *, 16>;
 
@@ -112,6 +111,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
   llvm::Error
   fuseKernel(llvm::Module &M, llvm::Function &StubFunction,
              jit_compiler::SYCLModuleInfo *ModInfo,
+             llvm::TargetFusionInfo &TargetInfo,
              llvm::SmallPtrSetImpl<llvm::Function *> &ToCleanUp) const;
 
   void canonicalizeParameters(
@@ -119,11 +119,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
 
   Parameter getParamFromMD(llvm::Metadata *MD) const;
 
-  void addToFusedMetadata(
-      llvm::Function *InputFunction, const llvm::StringRef &Kind,
-      const llvm::ArrayRef<bool> IsArgPresentMask,
-      llvm::SmallVectorImpl<llvm::Metadata *> &FusedMDList) const;
-
   void attachFusedMetadata(
       llvm::Function *FusedFunction, const llvm::StringRef &Kind,
       const llvm::ArrayRef<llvm::Metadata *> FusedMetadata) const;
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
new file mode 100644
index 0000000000000..191ab676613bb
--- /dev/null
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -0,0 +1,225 @@
+//==---------------------- TargetFusionInfo.cpp ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TargetFusionInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+using namespace llvm;
+
+//
+// TargetFusionInfo
+//
+
+TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) {
+  llvm::Triple Tri(Mod->getTargetTriple());
+  if (Tri.isNVPTX()) {
+    return TargetFusionInfo(
+        std::shared_ptr<NVPTXTargetFusionInfo>(new NVPTXTargetFusionInfo(Mod)));
+  }
+  if (Tri.isSPIRV()) {
+    return TargetFusionInfo(
+        std::shared_ptr<SPIRVTargetFusionInfo>(new SPIRVTargetFusionInfo(Mod)));
+  }
+  assert(false && "Unsupported target for fusion");
+}
+
+//
+// SPIRVTargetFusionInfo
+//
+
+void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
+  KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
+}
+
+ArrayRef<StringRef> SPIRVTargetFusionInfo::getKernelMetadataKeys() {
+  static SmallVector<StringRef> Keys{
+      {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
+       "kernel_arg_base_type", "kernel_arg_type_qual"}};
+  return Keys;
+}
+
+void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) {
+  static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
+  static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
+  // Remove all existing calls of the ITT instrumentation functions. Insert new
+  // ones in the entry block of the fused kernel and every exit block if the
+  // functions are present in the module.
+  // We cannot use the existing SPIRITTAnnotations pass, because that pass might
+  // insert calls to functions not present in the module (e.g., ITT
+  // instrumentations for barriers). As the JITed module is not linked with
+  // libdevice anymore, the functions would remain unresolved and cause the
+  // driver to fail.
+  Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper);
+  Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper);
+  bool InsertWrappers =
+      ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
+       (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
+  auto *WrapperFuncTy = FunctionType::get(
+      Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false);
+  for (auto &BB : *KernelFunc) {
+    for (auto Inst = BB.begin(); Inst != BB.end();) {
+      if (auto *CB = dyn_cast<CallBase>(Inst)) {
+        if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
+          Inst = Inst->eraseFromParent();
+          continue;
+        }
+      }
+      ++Inst;
+    }
+    if (InsertWrappers) {
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+        auto *WrapperCall =
+            CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
+        WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
+      }
+    }
+  }
+  if (InsertWrappers) {
+    KernelFunc->getEntryBlock().getFirstInsertionPt();
+    auto *WrapperCall =
+        CallInst::Create(WrapperFuncTy, StartWrapperFunc, "",
+                         &*KernelFunc->getEntryBlock().getFirstInsertionPt());
+    WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
+  }
+}
+
+void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
+                                              int BarrierFlags) {
+  if (BarrierFlags == -1) {
+    return;
+  }
+  assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) &&
+         "Invalid barrier flags");
+
+  static const auto FnAttrs = AttributeSet::get(
+      LLVMMod->getContext(),
+      {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent),
+       Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)});
+
+  static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"};
+
+  Function *F = LLVMMod->getFunction(N);
+  if (!F) {
+    constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage;
+
+    auto *Ty = FunctionType::get(
+        Builder.getVoidTy(),
+        {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
+        false /* isVarArg*/);
+
+    F = Function::Create(Ty, Linkage, N, *LLVMMod);
+
+    F->setAttributes(
+        AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
+    F->setCallingConv(CallingConv::SPIR_FUNC);
+  }
+
+  // See
+  // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
+  SmallVector<Value *> Args{
+      Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+      Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+      Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) |
+                       ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))};
+
+  auto *BarrierCallInst = Builder.CreateCall(F, Args);
+  BarrierCallInst->setAttributes(
+      AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
+  BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
+}
+
+//
+// NVPTXTargetFusionInfo
+//
+
+void NVPTXTargetFusionInfo::notifyFunctionsDelete(
+    llvm::ArrayRef<Function *> Funcs) {
+  SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
+  SmallVector<MDNode *> ValidKernels;
+  auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
+  for (auto *Op : OldAnnotations->operands()) {
+    if (auto *TOp = dyn_cast<MDTuple>(Op)) {
+      if (auto *COp = dyn_cast_if_present<ConstantAsMetadata>(
+              TOp->getOperand(0).get())) {
+        if (!DeletedFuncs.contains(COp->getValue())) {
+          ValidKernels.push_back(Op);
+          // Add to the set to also remove duplicate entries.
+          DeletedFuncs.insert(COp->getValue());
+        }
+      }
+    }
+  }
+  LLVMMod->eraseNamedMetadata(OldAnnotations);
+  auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+  for (auto *Kernel : ValidKernels) {
+    NewAnnotations->addOperand(Kernel);
+  }
+}
+
+void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
+  auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+  auto *MDOne = ConstantAsMetadata::get(
+      ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
+  auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel");
+  auto *MDFunc = ConstantAsMetadata::get(KernelFunc);
+  SmallVector<Metadata *, 3> KernelMD({MDFunc, MDKernelString, MDOne});
+  auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD);
+  NVVMAnnotations->addOperand(Tuple);
+}
+
+ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() {
+  // FIXME: Check whether we need to take care of sycl_fixed_targets.
+  static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
+                                      "kernel_arg_runtime_aligned",
+                                      "kernel_arg_exclusive_ptr"}};
+  return Keys;
+}
+
+void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
+                                              int BarrierFlags) {
+  if (BarrierFlags == -1) {
+    return;
+  }
+  // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX
+  // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0
+  // instruction, equivalent to the __syncthreads() call in CUDA."
+  Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {});
+}
+
+//
+// MetadataCollection
+//
+
+MetadataCollection::MetadataCollection(ArrayRef<StringRef> MDKeys)
+    : Keys{MDKeys}, Collection(MDKeys.size()) {}
+
+void MetadataCollection::collectFromFunction(
+    llvm::Function *Func, const ArrayRef<bool> IsArgPresentMask) {
+  for (auto &Key : Keys) {
+    // TODO: Do we want to assert for the presence of the metadata here?
+    if (auto *MD = Func->getMetadata(Key)) {
+      for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) {
+        if (std::get<0>(MaskedOps)) {
+          Collection[Key].emplace_back(std::get<1>(MaskedOps).get());
+        }
+      }
+    }
+  }
+}
+
+void MetadataCollection::attachToFunction(llvm::Function *Func) {
+  for (auto &Key : Keys) {
+    // Attach a list of fused metadata for a kind to the fused function.
+    auto *MDEntries = MDNode::get(Func->getContext(), Collection[Key]);
+    Func->setMetadata(Key, MDEntries);
+  }
+}
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h
new file mode 100644
index 0000000000000..653af4904e2ae
--- /dev/null
+++ b/sycl-fusion/passes/target/TargetFusionInfo.h
@@ -0,0 +1,137 @@
+//==-- TargetFusionInfo.h - Encapsule target-specific fusion functionality -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+
+namespace llvm {
+
+class TargetFusionInfoImpl {
+
+public:
+  virtual ~TargetFusionInfoImpl() = default;
+
+  virtual void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) {
+    (void)Funcs;
+  }
+
+  virtual void addKernelFunction(Function *KernelFunc) { (void)KernelFunc; }
+
+  virtual void postProcessKernel(Function *KernelFunc) { (void)KernelFunc; }
+
+  virtual ArrayRef<StringRef> getKernelMetadataKeys() { return {}; }
+
+  virtual void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) = 0;
+
+protected:
+  explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
+
+  llvm::Module *LLVMMod;
+
+  friend class TargetFusionInfo;
+};
+
+class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  void addKernelFunction(Function *KernelFunc) override;
+
+  ArrayRef<StringRef> getKernelMetadataKeys() override;
+
+  void postProcessKernel(Function *KernelFunc) override;
+
+  void createBarrierCall(IRBuilderBase& Builder, int BarrierFlags) override;
+
+private:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
+};
+
+class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) override;
+
+  void addKernelFunction(Function *KernelFunc) override;
+
+  ArrayRef<StringRef> getKernelMetadataKeys() override;
+
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) override;
+
+private:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
+};
+
+///
+/// Common interface to target-specific logic around handling of kernel
+/// functions.
+class TargetFusionInfo {
+public:
+  ///
+  /// Create the correct target-specific implementation based on the target
+  /// triple of \p Module.
+  static TargetFusionInfo getTargetFusionInfo(llvm::Module *Module);
+
+  ///
+  /// Notify the target-specific implementation that set of functions \p Funcs
+  /// is about to be erased from the module. This should be called BEFORE
+  /// erasing the functions.
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) {
+    Impl->notifyFunctionsDelete(Funcs);
+  }
+
+  ///
+  /// Notify the target-specific implementation that the function \p KernelFunc
+  /// was added as a new kernel. This should be called AFTER the function has
+  /// been added.
+  void addKernelFunction(llvm::Function *KernelFunc) {
+    Impl->addKernelFunction(KernelFunc);
+  }
+
+  ///
+  /// Target-specific post-processing of the new kernel function \p KernelFunc.
+  /// This should be called AFTER the function has been added and defined.
+  void postProcessKernel(Function *KernelFunc) {
+    Impl->postProcessKernel(KernelFunc);
+  }
+
+  ///
+  /// Get the target-specific list of argument metadata attached to each
+  /// function that should be collected and attached to the fused kernel.
+  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() {
+    return Impl->getKernelMetadataKeys();
+  }
+
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) {
+    Impl->createBarrierCall(Builder, BarrierFlags);
+  }
+
+private:
+  using ImplPtr = std::shared_ptr<TargetFusionInfoImpl>;
+
+  TargetFusionInfo(ImplPtr &&I) : Impl{I} {}
+
+  ImplPtr Impl;
+};
+
+///
+/// Simple helper to collect a target-specific set of kernel argument metadata
+/// from input functions and attach it to a fused kernel.
+class MetadataCollection {
+public:
+  explicit MetadataCollection(llvm::ArrayRef<llvm::StringRef> MDKeys);
+
+  void collectFromFunction(llvm::Function *Func,
+                           const ArrayRef<bool> IsArgPresentMask);
+
+  void attachToFunction(llvm::Function *Func);
+
+private:
+  llvm::SmallVector<llvm::StringRef> Keys;
+
+  llvm::StringMap<llvm::SmallVector<llvm::Metadata *>> Collection;
+};
+} // namespace llvm

From 894ee89494f467f16e347258246a56bf04a03a5c Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 17 Jan 2023 08:19:21 +0000
Subject: [PATCH 08/25] [SYCL][Fusion] Do not require null terminator

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 .../jit-compiler/lib/translation/KernelTranslation.cpp       | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 31fa850d58d63..7e1318c70a4a9 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -111,7 +111,10 @@ KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx,
   llvm::StringRef RawData(reinterpret_cast<const char *>(BinInfo.BinaryStart),
                           BinInfo.BinarySize);
   return llvm::parseBitcodeFile(
-      MemoryBuffer::getMemBuffer(RawData)->getMemBufferRef(), LLVMCtx);
+      MemoryBuffer::getMemBuffer(RawData, Kernel.Name,
+                                 /* RequiresNullTermnator*/ false)
+          ->getMemBufferRef(),
+      LLVMCtx);
 }
 
 llvm::Expected<std::unique_ptr<llvm::Module>>

From a909a75bc2fcf3f80ee1835ffdcc6187dfdb6cad Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 17 Jan 2023 14:42:40 +0000
Subject: [PATCH 09/25] [SYCL][Fusion] Refactor more target-specific code

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |  1 -
 .../lib/fusion/FusionPipeline.cpp             | 19 ++++-
 .../lib/translation/KernelTranslation.cpp     |  3 -
 sycl-fusion/passes/CMakeLists.txt             |  1 +
 sycl-fusion/passes/cleanup/Cleanup.cpp        | 17 ++--
 sycl-fusion/passes/cleanup/Cleanup.h          |  4 +-
 .../internalization/Internalization.cpp       | 61 ++++---------
 .../passes/kernel-fusion/SYCLKernelFusion.cpp | 13 +--
 sycl-fusion/passes/syclcp/SYCLCP.cpp          | 12 +--
 .../passes/target/TargetFusionInfo.cpp        | 60 +++++++++++--
 sycl-fusion/passes/target/TargetFusionInfo.h  | 85 +++++++++++++++----
 11 files changed, 177 insertions(+), 99 deletions(-)

diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 6c14ac182514a..821ad4cd36369 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -137,7 +137,6 @@ FusionResult KernelFusion::fuseKernels(
 
   SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName);
 
-  // TODO
   BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
 
   if (auto Error = translation::KernelTranslator::translateKernel(
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index 3ede007aa69a4..fb2e2b15200a7 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -23,6 +23,7 @@
 #ifndef NDEBUG
 #include "llvm/IR/Verifier.h"
 #endif // NDEBUG
+#include "llvm/ADT/Triple.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
@@ -35,6 +36,21 @@ using namespace llvm;
 using namespace jit_compiler;
 using namespace jit_compiler::fusion;
 
+static unsigned getFlatAddressSpace(Module &Mod) {
+  // Ideally, we could get this information from the TargetTransformInfo, but
+  // the SPIR-V backend does not yet seem to have an implementation for that.
+  llvm::Triple Tri(Mod.getTargetTriple());
+  if (Tri.isNVPTX()) {
+    return 0;
+  }
+  if (Tri.isSPIRV() || Tri.isSPIR()) {
+    return 4;
+  }
+  // Identical to the definition of "UninitializedAddressSpace" in
+  // "InferAddressSpaces.cpp".
+  return std::numeric_limits<unsigned>::max();
+}
+
 std::unique_ptr<SYCLModuleInfo>
 FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
                                 int BarriersFlags) {
@@ -86,9 +102,8 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
     // Run the InferAddressSpace pass to remove as many address-space casts
     // to/from generic address-space as possible, because these hinder
     // internalization.
-    // FIXME: TTI should tell the pass which address space to use.
     // Ideally, the static compiler should have performed that job.
-    constexpr unsigned FlatAddressSpace = 4;
+    unsigned FlatAddressSpace = getFlatAddressSpace(Mod);
     FPM.addPass(InferAddressSpacesPass(FlatAddressSpace));
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   }
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 7e1318c70a4a9..3db02dbfd717c 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -217,8 +217,5 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
     ASMStream.flush();
   }
 
-  llvm::dbgs() << "PTX size: " << PTXASM.size() << "\n";
-  llvm::dbgs() << "PTX:\n" << PTXASM << "\n";
-
   return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX);
 }
diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index 41398f6ee9e99..c74c3c40e34e0 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(SYCLKernelFusion MODULE
   syclcp/SYCLCP.cpp
   cleanup/Cleanup.cpp
   debug/PassDebug.cpp
+  target/TargetFusionInfo.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/sycl-fusion/passes/cleanup/Cleanup.cpp b/sycl-fusion/passes/cleanup/Cleanup.cpp
index 07fc86d9d0dd3..a5bc3e634c527 100644
--- a/sycl-fusion/passes/cleanup/Cleanup.cpp
+++ b/sycl-fusion/passes/cleanup/Cleanup.cpp
@@ -45,7 +45,8 @@ static void copyAttributesFrom(const BitVector &Mask, Function *NF,
                                        PAL.getRetAttrs(), Attributes));
 }
 
-static Function *createMaskedFunction(const BitVector &Mask, Function *F) {
+static Function *createMaskedFunction(const BitVector &Mask, Function *F,
+                                      TargetFusionInfo &TFI) {
   // Declare
   FunctionType *NFTy = createMaskedFunctionType(Mask, F->getFunctionType());
   Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(),
@@ -78,7 +79,9 @@ static Function *createMaskedFunction(const BitVector &Mask, Function *F) {
   }
 
   // Erase old function
+  TFI.notifyFunctionsDelete(F);
   F->eraseFromParent();
+  TFI.addKernelFunction(NF);
   return NF;
 }
 
@@ -104,9 +107,9 @@ static void updateArgUsageMask(jit_compiler::SYCLKernelInfo *Info,
 
 static void applyArgMask(const jit_compiler::ArgUsageMask &NewArgInfo,
                          const BitVector &Mask, Function *F,
-                         ModuleAnalysisManager &AM) {
+                         ModuleAnalysisManager &AM, TargetFusionInfo &TFI) {
   // Create the function without the masked-out args.
-  Function *NF = createMaskedFunction(Mask, F);
+  Function *NF = createMaskedFunction(Mask, F, TFI);
   // Update the unused args mask.
   jit_compiler::SYCLModuleInfo *ModuleInfo =
       AM.getResult<SYCLModuleInfoAnalysis>(*NF->getParent()).ModuleInfo;
@@ -125,9 +128,7 @@ static void maskMD(const BitVector &Mask, Function *F) {
   SmallVector<std::pair<unsigned, MDNode *>> MD;
   F->getAllMetadata(MD);
   for (const auto &Entry : MD) {
-    auto MDKind = Entry.first;
-    if (MDKind == F->getContext().getMDKindID("reqd_work_group_size") ||
-        MDKind == F->getContext().getMDKindID("work_group_size_hint")) {
+    if (Entry.second->getNumOperands() != Mask.size()) {
       // Some metadata, e.g., the metadata for reqd_work_group_size and
       // work_group_size_hint is independent from the number of arguments
       // and must not be filtered by the argument usage mask.
@@ -144,7 +145,7 @@ static void maskMD(const BitVector &Mask, Function *F) {
 
 void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo,
                        Function *F, ModuleAnalysisManager &AM,
-                       ArrayRef<StringRef> MDToErase) {
+                       TargetFusionInfo &TFI, ArrayRef<StringRef> MDToErase) {
   // Erase metadata.
   for (auto Key : MDToErase) {
     F->setMetadata(Key, nullptr);
@@ -158,5 +159,5 @@ void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo,
   // Update metadata.
   maskMD(CleanupMask, F);
   // Remove arguments.
-  applyArgMask(ArgUsageInfo, CleanupMask, F, AM);
+  applyArgMask(ArgUsageInfo, CleanupMask, F, AM, TFI);
 }
diff --git a/sycl-fusion/passes/cleanup/Cleanup.h b/sycl-fusion/passes/cleanup/Cleanup.h
index 49619e4b9af07..491d96f46a886 100644
--- a/sycl-fusion/passes/cleanup/Cleanup.h
+++ b/sycl-fusion/passes/cleanup/Cleanup.h
@@ -10,6 +10,7 @@
 #define SYCL_FUSION_PASSES_CLEANUP_H
 
 #include "Kernel.h"
+#include "target/TargetFusionInfo.h"
 #include <llvm/ADT/ArrayRef.h>
 #include <llvm/ADT/BitVector.h>
 #include <llvm/ADT/StringRef.h>
@@ -25,7 +26,8 @@ namespace llvm {
 /// @param[in] AM Module analysis manager.
 /// @param[in] EraseMD Keys of metadata to remove.
 void fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, Function *F,
-                 ModuleAnalysisManager &AM, ArrayRef<StringRef> EraseMD);
+                 ModuleAnalysisManager &AM, TargetFusionInfo &TFI,
+                 ArrayRef<StringRef> EraseMD);
 } // namespace llvm
 
 #endif // SYCL_FUSION_PASSES_CLEANUP_H
diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp
index 61f3a0738921b..bca46ba43acb3 100644
--- a/sycl-fusion/passes/internalization/Internalization.cpp
+++ b/sycl-fusion/passes/internalization/Internalization.cpp
@@ -19,16 +19,12 @@
 #include "cleanup/Cleanup.h"
 #include "debug/PassDebug.h"
 #include "metadata/MDParsing.h"
+#include "target/TargetFusionInfo.h"
 
 #define DEBUG_TYPE "sycl-fusion"
 
 using namespace llvm;
 
-// Corresponds to definition of spir_private and spir_local in
-// "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
-constexpr static unsigned PrivateAS{0};
-constexpr static unsigned LocalAS{3};
-
 constexpr static StringLiteral PrivatePromotion{"private"};
 constexpr static StringLiteral LocalPromotion{"local"};
 constexpr static StringLiteral NoPromotion{"none"};
@@ -44,6 +40,8 @@ struct SYCLInternalizerImpl {
   StringRef Kind;
   /// Whether or not to create allocas.
   bool CreateAllocas;
+  /// Interface to target-specific information.
+  TargetFusionInfo TargetInfo;
 
   /// Implements internalization the pass run.
   PreservedAnalyses operator()(Module &M, ModuleAnalysisManager &AM) const;
@@ -338,11 +336,14 @@ Error SYCLInternalizerImpl::checkArgsPromotable(
 
 ///
 /// Function to perform the required cleaning actions.
-static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal) {
+static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal,
+                    const TargetFusionInfo &TFI) {
   if (!KeepOriginal) {
     NewF->takeName(OldF);
+    TFI.notifyFunctionsDelete(OldF);
     OldF->eraseFromParent();
   }
+  TFI.addKernelFunction(NewF);
 }
 
 void SYCLInternalizerImpl::promoteCall(CallBase *C, const Value *Val,
@@ -499,11 +500,6 @@ Value *replaceByNewAlloca(Argument *Arg, unsigned AS, std::size_t LocalSize) {
 Function *SYCLInternalizerImpl::promoteFunctionArgs(
     Function *OldF, ArrayRef<std::size_t> PromoteToLocal, bool CreateAllocas,
     bool KeepOriginal) const {
-  constexpr unsigned AddressSpaceBitWidth{32};
-
-  auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
-      IntegerType::get(OldF->getContext(), AddressSpaceBitWidth), AS));
-
   // We first declare the promoted function with the new signature.
   Function *NewF =
       getPromotedFunctionDeclaration(OldF, PromoteToLocal, AS,
@@ -542,32 +538,9 @@ Function *SYCLInternalizerImpl::promoteFunctionArgs(
     promoteValue(Arg, LocalSize);
   }
 
-  {
-    constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"};
-    if (auto *AddrspaceMD =
-            dyn_cast_or_null<MDNode>(NewF->getMetadata(KernelArgAddrSpaceMD))) {
-      // If we have kernel_arg_addr_space metadata in the original function,
-      // we should update it in the new one.
-      SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
-                                      AddrspaceMD->op_end()};
-      for (auto I : enumerate(PromoteToLocal)) {
-        if (I.value() == 0) {
-          continue;
-        }
-        const auto Index = I.index();
-        if (const auto *PtrTy =
-                dyn_cast<PointerType>(NewF->getArg(Index)->getType())) {
-          if (PtrTy->getAddressSpace() == LocalAS) {
-            NewInfo[Index] = NewAddrspace;
-          }
-        }
-      }
-      NewF->setMetadata(KernelArgAddrSpaceMD,
-                        MDNode::get(NewF->getContext(), NewInfo));
-    }
-  }
+  TargetInfo.updateAddressSpaceMetadata(NewF, PromoteToLocal, AS);
 
-  cleanup(OldF, NewF, KeepOriginal);
+  cleanup(OldF, NewF, KeepOriginal, TargetInfo);
 
   return NewF;
 }
@@ -625,7 +598,8 @@ SYCLInternalizerImpl::operator()(Module &M, ModuleAnalysisManager &AM) const {
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
 
-static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
+static void moduleCleanup(Module &M, ModuleAnalysisManager &AM,
+                          TargetFusionInfo &TFI) {
   SmallVector<Function *> ToProcess;
   for (auto &F : M) {
     if (F.hasMetadata(SYCLInternalizer::Key)) {
@@ -650,24 +624,25 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
         NewArgInfo.push_back(jit_compiler::ArgUsage::Used);
       }
     }
-    fullCleanup(NewArgInfo, F, AM,
+    fullCleanup(NewArgInfo, F, AM, TFI,
                 {SYCLInternalizer::Key, SYCLInternalizer::LocalSizeKey});
   }
 }
 
 PreservedAnalyses llvm::SYCLInternalizer::run(Module &M,
                                               ModuleAnalysisManager &AM) {
+  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
   // Private promotion
-  const PreservedAnalyses Tmp =
-      SYCLInternalizerImpl{PrivateAS, PrivatePromotion, true}(M, AM);
+  const PreservedAnalyses Tmp = SYCLInternalizerImpl{
+      TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM);
   // Local promotion
-  PreservedAnalyses Res =
-      SYCLInternalizerImpl{LocalAS, LocalPromotion, false}(M, AM);
+  PreservedAnalyses Res = SYCLInternalizerImpl{
+      TFI.getLocalAddressSpace(), LocalPromotion, false, TFI}(M, AM);
 
   Res.intersect(Tmp);
 
   if (!Res.areAllPreserved()) {
-    moduleCleanup(M, AM);
+    moduleCleanup(M, AM, TFI);
   }
   return Res;
 }
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
index d3dec7f16a81f..a0079affbaa06 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
@@ -423,8 +423,7 @@ Error SYCLKernelFusion::fuseKernel(
       ++ParamIndex;
     }
     // Add the metadata corresponding to the used arguments to the different
-    // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because
-    // the kernel arguments receive new names in the fused kernel.
+    // lists.
     MDCollection.collectFromFunction(FF, UsedArgsMask);
 
     // Update the fused kernel's KernelInfo with information from this input
@@ -484,16 +483,12 @@ Error SYCLKernelFusion::fuseKernel(
   }
 
   // Attach names to the arguments. The name includes a prefix for the kernel
-  // from which this argument came. The names are also attached as metadata
-  // with kind "kernel_arg_name".
-  // NOTE: While the kernel_arg_name metadata is required, naming the
-  // parameters themselves is not necessary for functionality, it just improves
-  // readibility for debugging purposes.
-  SmallVector<Metadata *, 16> KernelArgNames;
+  // from which this argument came. Naming the parameters themselves is not
+  // necessary for functionality, it just improves readibility for debugging
+  // purposes.
   for (const auto &AI : llvm::enumerate(FusedFunction->args())) {
     auto &ArgName = FusedArgNames[AI.index()];
     AI.value().setName(ArgName);
-    KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName));
   }
   // Attach the fused metadata collected from the different input
   // kernels to the fused function.
diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp
index b520620c232d1..5cb9b00433ef7 100644
--- a/sycl-fusion/passes/syclcp/SYCLCP.cpp
+++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp
@@ -41,7 +41,7 @@ static Expected<SmallVector<ConstantInfo>> getCPFromMD(Function *F) {
   MDNode *MD = F->getMetadata(SYCLCP::Key);
   if (!MD) {
     return createStringError(inconvertibleErrorCode(),
-                             "Private promotion metadata not available");
+                             "Constant progagation metadata not available");
   }
   for (auto I : enumerate(MD->operands())) {
     Expected<SmallVector<unsigned char>> Val =
@@ -205,7 +205,8 @@ static bool propagateConstants(Function *F, ArrayRef<ConstantInfo> Constants) {
   return Changed;
 }
 
-static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
+static void moduleCleanup(Module &M, ModuleAnalysisManager &AM,
+                          TargetFusionInfo &TFI) {
   SmallVector<Function *> ToProcess;
   for (auto &F : M) {
     if (F.hasMetadata(SYCLCP::Key)) {
@@ -219,14 +220,13 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) {
       if (const auto *MDS = dyn_cast<MDString>(I.value().get())) {
         // A value is masked-out if it has a non-empty MDString
         if (MDS->getLength() > 0) {
-          // And is either an integer or a FP number.
           NewArgInfo.push_back(jit_compiler::ArgUsage::Unused);
           continue;
         }
       }
       NewArgInfo.push_back(jit_compiler::ArgUsage::Used);
     }
-    fullCleanup(NewArgInfo, F, AM, {SYCLCP::Key});
+    fullCleanup(NewArgInfo, F, AM, TFI, {SYCLCP::Key});
   }
 }
 
@@ -249,8 +249,10 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) {
     Changed = propagateConstants(F, *ConstantsOrErr) || Changed;
   }
 
+  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+
   if (Changed) {
-    moduleCleanup(M, AM);
+    moduleCleanup(M, AM, TFI);
   }
 
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
index 191ab676613bb..c17e2464ac323 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.cpp
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -25,7 +25,7 @@ TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) {
     return TargetFusionInfo(
         std::shared_ptr<NVPTXTargetFusionInfo>(new NVPTXTargetFusionInfo(Mod)));
   }
-  if (Tri.isSPIRV()) {
+  if (Tri.isSPIRV() || Tri.isSPIR()) {
     return TargetFusionInfo(
         std::shared_ptr<SPIRVTargetFusionInfo>(new SPIRVTargetFusionInfo(Mod)));
   }
@@ -36,18 +36,28 @@ TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) {
 // SPIRVTargetFusionInfo
 //
 
-void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
+void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) const {
   KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
 }
 
-ArrayRef<StringRef> SPIRVTargetFusionInfo::getKernelMetadataKeys() {
+ArrayRef<StringRef> SPIRVTargetFusionInfo::getKernelMetadataKeys() const {
+  // NOTE: We do not collect the "kernel_arg_name" metadata, because
+  // the kernel arguments receive new names in the fused kernel.
   static SmallVector<StringRef> Keys{
       {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
        "kernel_arg_base_type", "kernel_arg_type_qual"}};
   return Keys;
 }
 
-void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) {
+void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) const {
+  // Attach the kernel_arg_name metadata.
+  SmallVector<Metadata *, 16> KernelArgNames;
+  for (auto &P : KernelFunc->args()) {
+    KernelArgNames.push_back(MDString::get(LLVMMod->getContext(), P.getName()));
+  }
+  auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames);
+  KernelFunc->setMetadata("kernel_arg_name", ArgNameMD);
+
   static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
   static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
   // Remove all existing calls of the ITT instrumentation functions. Insert new
@@ -93,7 +103,7 @@ void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) {
 }
 
 void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
-                                              int BarrierFlags) {
+                                              int BarrierFlags) const {
   if (BarrierFlags == -1) {
     return;
   }
@@ -137,12 +147,44 @@ void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
   BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
 }
 
+void SPIRVTargetFusionInfo::updateAddressSpaceMetadata(
+    Function *KernelFunc, ArrayRef<size_t> LocalSize,
+    unsigned AddressSpace) const {
+  static constexpr unsigned AddressSpaceBitWidth{32};
+  static constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"};
+
+  auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
+      IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth),
+      AddressSpace));
+  if (auto *AddrspaceMD = dyn_cast_or_null<MDNode>(
+          KernelFunc->getMetadata(KernelArgAddrSpaceMD))) {
+    // If we have kernel_arg_addr_space metadata in the original function,
+    // we should update it in the new one.
+    SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
+                                    AddrspaceMD->op_end()};
+    for (auto I : enumerate(LocalSize)) {
+      if (I.value() == 0) {
+        continue;
+      }
+      const auto Index = I.index();
+      if (const auto *PtrTy =
+              dyn_cast<PointerType>(KernelFunc->getArg(Index)->getType())) {
+        if (PtrTy->getAddressSpace() == getLocalAddressSpace()) {
+          NewInfo[Index] = NewAddrspace;
+        }
+      }
+    }
+    KernelFunc->setMetadata(KernelArgAddrSpaceMD,
+                            MDNode::get(KernelFunc->getContext(), NewInfo));
+  }
+}
+
 //
 // NVPTXTargetFusionInfo
 //
 
 void NVPTXTargetFusionInfo::notifyFunctionsDelete(
-    llvm::ArrayRef<Function *> Funcs) {
+    llvm::ArrayRef<Function *> Funcs) const {
   SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
   SmallVector<MDNode *> ValidKernels;
   auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
@@ -165,7 +207,7 @@ void NVPTXTargetFusionInfo::notifyFunctionsDelete(
   }
 }
 
-void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
+void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) const {
   auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
   auto *MDOne = ConstantAsMetadata::get(
       ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
@@ -176,7 +218,7 @@ void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) {
   NVVMAnnotations->addOperand(Tuple);
 }
 
-ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() {
+ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() const {
   // FIXME: Check whether we need to take care of sycl_fixed_targets.
   static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
                                       "kernel_arg_runtime_aligned",
@@ -185,7 +227,7 @@ ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() {
 }
 
 void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
-                                              int BarrierFlags) {
+                                              int BarrierFlags) const {
   if (BarrierFlags == -1) {
     return;
   }
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h
index 653af4904e2ae..c307de2dd764e 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.h
+++ b/sycl-fusion/passes/target/TargetFusionInfo.h
@@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
+#define SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
+
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
@@ -17,17 +20,33 @@ class TargetFusionInfoImpl {
 public:
   virtual ~TargetFusionInfoImpl() = default;
 
-  virtual void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) {
+  virtual void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const {
     (void)Funcs;
   }
 
-  virtual void addKernelFunction(Function *KernelFunc) { (void)KernelFunc; }
+  virtual void addKernelFunction(Function *KernelFunc) const {
+    (void)KernelFunc;
+  }
+
+  virtual void postProcessKernel(Function *KernelFunc) const {
+    (void)KernelFunc;
+  }
+
+  virtual ArrayRef<StringRef> getKernelMetadataKeys() const { return {}; }
+
+  virtual void createBarrierCall(IRBuilderBase &Builder,
+                                 int BarrierFlags) const = 0;
 
-  virtual void postProcessKernel(Function *KernelFunc) { (void)KernelFunc; }
+  virtual unsigned getPrivateAddressSpace() const = 0;
 
-  virtual ArrayRef<StringRef> getKernelMetadataKeys() { return {}; }
+  virtual unsigned getLocalAddressSpace() const = 0;
 
-  virtual void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) = 0;
+  virtual void updateAddressSpaceMetadata(Function *KernelFunc,
+                                          ArrayRef<size_t> LocalSize,
+                                          unsigned AddressSpace) const {
+    (void)KernelFunc;
+    (void)LocalSize;
+  }
 
 protected:
   explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
@@ -39,13 +58,23 @@ class TargetFusionInfoImpl {
 
 class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
 public:
-  void addKernelFunction(Function *KernelFunc) override;
+  void addKernelFunction(Function *KernelFunc) const override;
+
+  ArrayRef<StringRef> getKernelMetadataKeys() const override;
 
-  ArrayRef<StringRef> getKernelMetadataKeys() override;
+  void postProcessKernel(Function *KernelFunc) const override;
 
-  void postProcessKernel(Function *KernelFunc) override;
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override;
 
-  void createBarrierCall(IRBuilderBase& Builder, int BarrierFlags) override;
+  // Corresponds to definition of spir_private and spir_local in
+  // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
+
+  void updateAddressSpaceMetadata(Function *KernelFunc,
+                                  ArrayRef<size_t> LocalSize,
+                                  unsigned AddressSpace) const override;
 
 private:
   using TargetFusionInfoImpl::TargetFusionInfoImpl;
@@ -53,13 +82,19 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
 
 class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
 public:
-  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) override;
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override;
+
+  void addKernelFunction(Function *KernelFunc) const override;
 
-  void addKernelFunction(Function *KernelFunc) override;
+  ArrayRef<StringRef> getKernelMetadataKeys() const override;
 
-  ArrayRef<StringRef> getKernelMetadataKeys() override;
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override;
 
-  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) override;
+  // Corresponds to the definitions in the LLVM NVPTX backend user guide:
+  // https://llvm.org/docs/NVPTXUsage.html#address-spaces
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
 
 private:
   using TargetFusionInfoImpl::TargetFusionInfoImpl;
@@ -79,7 +114,7 @@ class TargetFusionInfo {
   /// Notify the target-specific implementation that set of functions \p Funcs
   /// is about to be erased from the module. This should be called BEFORE
   /// erasing the functions.
-  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) {
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const {
     Impl->notifyFunctionsDelete(Funcs);
   }
 
@@ -87,28 +122,40 @@ class TargetFusionInfo {
   /// Notify the target-specific implementation that the function \p KernelFunc
   /// was added as a new kernel. This should be called AFTER the function has
   /// been added.
-  void addKernelFunction(llvm::Function *KernelFunc) {
+  void addKernelFunction(llvm::Function *KernelFunc) const {
     Impl->addKernelFunction(KernelFunc);
   }
 
   ///
   /// Target-specific post-processing of the new kernel function \p KernelFunc.
   /// This should be called AFTER the function has been added and defined.
-  void postProcessKernel(Function *KernelFunc) {
+  void postProcessKernel(Function *KernelFunc) const {
     Impl->postProcessKernel(KernelFunc);
   }
 
   ///
   /// Get the target-specific list of argument metadata attached to each
   /// function that should be collected and attached to the fused kernel.
-  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() {
+  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() const {
     return Impl->getKernelMetadataKeys();
   }
 
-  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) {
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const {
     Impl->createBarrierCall(Builder, BarrierFlags);
   }
 
+  unsigned getPrivateAddressSpace() const {
+    return Impl->getPrivateAddressSpace();
+  }
+
+  unsigned getLocalAddressSpace() const { return Impl->getLocalAddressSpace(); }
+
+  void updateAddressSpaceMetadata(Function *KernelFunc,
+                                  ArrayRef<size_t> LocalSize,
+                                  unsigned AddressSpace) const {
+    Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace);
+  }
+
 private:
   using ImplPtr = std::shared_ptr<TargetFusionInfoImpl>;
 
@@ -135,3 +182,5 @@ class MetadataCollection {
   llvm::StringMap<llvm::SmallVector<llvm::Metadata *>> Collection;
 };
 } // namespace llvm
+
+#endif // SYCL_FUSION_PASSES_TARGETFUSIONINFO_H

From 32172f0bf088efd70b854fd334bf9f415aa7706c Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 14 Feb 2023 13:20:48 +0000
Subject: [PATCH 10/25] [SYCL][Fusion] Handle attributes for CUDA fusion

Handle reqd_work_group_size and work_group_size_hint attributes.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/include/JITContext.h |  3 +-
 .../lib/translation/KernelTranslation.cpp     | 42 +++++++++++++++++++
 .../lib/translation/KernelTranslation.h       |  2 +-
 .../lib/translation/SPIRVLLVMTranslation.cpp  | 37 ----------------
 .../lib/translation/SPIRVLLVMTranslation.h    | 18 +-------
 sycl/source/detail/jit_compiler.cpp           | 40 ++++++++++++++++++
 sycl/source/detail/jit_compiler.hpp           |  7 +++-
 sycl/source/detail/jit_device_binaries.cpp    |  6 ++-
 sycl/source/detail/jit_device_binaries.hpp    |  5 ++-
 9 files changed, 99 insertions(+), 61 deletions(-)

diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index d4654f19820ce..c26312ceb70e6 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -65,8 +65,7 @@ class JITContext {
 
   llvm::LLVMContext *getLLVMContext();
 
-  KernelBinary &emplaceSPIRVBinary(std::string Binary,
-                                   BinaryFormat Format);
+  KernelBinary &emplaceSPIRVBinary(std::string Binary, BinaryFormat Format);
 
   std::optional<SYCLKernelInfo> getCacheEntry(CacheKeyT &Identifier) const;
 
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 3db02dbfd717c..1436f51f40aaa 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -9,6 +9,7 @@
 #include "KernelTranslation.h"
 #include "SPIRVLLVMTranslation.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -21,6 +22,43 @@ using namespace jit_compiler;
 using namespace jit_compiler::translation;
 using namespace llvm;
 
+///
+/// Get an attribute value consisting of NumValues scalar constant integers
+/// from the MDNode.
+static void getAttributeValues(std::vector<std::string> &Values, MDNode *MD) {
+  for (const auto &MDOp : MD->operands()) {
+    auto *ConstantMD = cast<ConstantAsMetadata>(MDOp);
+    auto *ConstInt = cast<ConstantInt>(ConstantMD->getValue());
+    Values.push_back(std::to_string(ConstInt->getZExtValue()));
+  }
+}
+
+// NOLINTNEXTLINE(readability-identifier-naming)
+static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size";
+// NOLINTNEXTLINE(readability-identifier-naming)
+static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint";
+
+///
+/// Restore kernel attributes for the kernel in Info from the metadata
+/// attached to its kernel function in the LLVM module Mod.
+/// Currently supported attributes:
+///   - reqd_work_group_size
+///   - work_group_size_hint
+static void restoreKernelAttributes(Module *Mod, SYCLKernelInfo &Info) {
+  auto *KernelFunction = Mod->getFunction(Info.Name);
+  assert(KernelFunction && "Kernel function not present in module");
+  if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) {
+    SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR};
+    getAttributeValues(ReqdAttr.Values, MD);
+    Info.Attributes.push_back(ReqdAttr);
+  }
+  if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) {
+    SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR};
+    getAttributeValues(HintAttr.Values, MD);
+    Info.Attributes.push_back(HintAttr);
+  }
+}
+
 llvm::Expected<std::unique_ptr<llvm::Module>>
 KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
                               std::vector<SYCLKernelInfo> &Kernels) {
@@ -100,6 +138,10 @@ KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
             "Number of address bits between SPIR-V modules does not match");
       }
     }
+    // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
+    // 'work_group_size_hint' from metadata attached to the kernel function and
+    // store it in the SYCLKernelInfo.
+    restoreKernelAttributes(Result.get(), Kernel);
   }
   return std::move(Result);
 }
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
index 3d6824edbdd37..276b6e536d269 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
@@ -23,7 +23,7 @@ class KernelTranslator {
   loadKernels(llvm::LLVMContext &LLVMCtx, std::vector<SYCLKernelInfo> &Kernels);
 
   static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod,
-                              JITContext &JITCtx, BinaryFormat Format);
+                                     JITContext &JITCtx, BinaryFormat Format);
 
 private:
   ///
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index 1046fd6dc2907..a5effb33a85e1 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -11,9 +11,7 @@
 #include "Kernel.h"
 #include "LLVMSPIRVLib.h"
 #include "helper/ErrorHandling.h"
-#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/raw_ostream.h"
@@ -24,37 +22,6 @@ using namespace jit_compiler;
 using namespace jit_compiler::translation;
 using namespace llvm;
 
-void SPIRVLLVMTranslator::getAttributeValues(std::vector<std::string> &Values,
-                                             MDNode *MD, size_t NumValues) {
-  assert(MD->getNumOperands() == NumValues && "Incorrect number of values");
-  for (const auto &MDOp : MD->operands()) {
-    auto *ConstantMD = cast<ConstantAsMetadata>(MDOp);
-    auto *ConstInt = cast<ConstantInt>(ConstantMD->getValue());
-    Values.push_back(std::to_string(ConstInt->getZExtValue()));
-  }
-}
-
-// NOLINTNEXTLINE(readability-identifier-naming)
-static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size";
-// NOLINTNEXTLINE(readability-identifier-naming)
-static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint";
-
-void SPIRVLLVMTranslator::restoreKernelAttributes(Module *Mod,
-                                                  SYCLKernelInfo &Info) {
-  auto *KernelFunction = Mod->getFunction(Info.Name);
-  assert(KernelFunction && "Kernel function not present in module");
-  if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) {
-    SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR};
-    getAttributeValues(ReqdAttr.Values, MD, 3);
-    Info.Attributes.push_back(ReqdAttr);
-  }
-  if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) {
-    SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR};
-    getAttributeValues(HintAttr.Values, MD, 3);
-    Info.Attributes.push_back(HintAttr);
-  }
-}
-
 SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() {
   static auto Opts = []() -> SPIRV::TranslatorOpts {
     // Options for translation between SPIR-V and LLVM IR.
@@ -113,10 +80,6 @@ SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
   }
   std::unique_ptr<Module> NewMod{LLVMMod};
 
-  // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
-  // 'work_group_size_hint' from metadata attached to the kernel function and
-  // store it in the SYCLKernelInfo.
-  restoreKernelAttributes(NewMod.get(), Kernel);
   return std::move(NewMod);
 }
 
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
index 440c00103b0d5..c8cdf2bf90ca0 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h
@@ -30,24 +30,10 @@ class SPIRVLLVMTranslator {
   ///
   /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and
   /// return a pointer to its container.
-  static llvm::Expected<KernelBinary *> translateLLVMtoSPIRV(llvm::Module &Mod,
-                                                            JITContext &JITCtx);
+  static llvm::Expected<KernelBinary *>
+  translateLLVMtoSPIRV(llvm::Module &Mod, JITContext &JITCtx);
 
 private:
-  ///
-  /// Get an attribute value consisting of NumValues scalar constant integers
-  /// from the MDNode.
-  static void getAttributeValues(std::vector<std::string> &Values,
-                                 llvm::MDNode *MD, size_t NumValues);
-
-  ///
-  /// Restore kernel attributes for the kernel in Info from the metadata
-  /// attached to its kernel function in the LLVM module Mod.
-  /// Currently supported attributes:
-  ///   - reqd_work_group_size
-  ///   - work_group_size_hint
-  static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info);
-
   ///
   /// Default settings for the SPIRV translation options.
   static SPIRV::TranslatorOpts &translatorOpts();
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 04a09ee84a72e..004fa07145b48 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -933,6 +933,29 @@ pi_device_binaries jit_compiler::createPIDeviceBinary(
 
   Binary.addProperty(std::move(ArgMaskPropSet));
 
+  if (Format == ::jit_compiler::BinaryFormat::PTX) {
+    // Add a program metadata property with the reqd_work_group_size attribute.
+    // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference.
+    auto ReqdWGS = std::find_if(
+        FusedKernelInfo.Attributes.begin(), FusedKernelInfo.Attributes.end(),
+        [](const ::jit_compiler::SYCLKernelAttribute &Attr) {
+          return Attr.AttributeName == "reqd_work_group_size";
+        });
+    if (ReqdWGS != FusedKernelInfo.Attributes.end()) {
+      auto Encoded = encodeReqdWorkGroupSize(*ReqdWGS);
+      std::stringstream PropName;
+      PropName << FusedKernelInfo.Name;
+      PropName << __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE;
+      PropertyContainer ReqdWorkGroupSizeProp{
+          PropName.str(), Encoded.data(), Encoded.size(),
+          pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY};
+      PropertySetContainer ProgramMetadata{
+          __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA};
+      ProgramMetadata.addProperty(std::move(ReqdWorkGroupSizeProp));
+      Binary.addProperty(std::move(ProgramMetadata));
+    }
+  }
+
   DeviceBinariesCollection Collection;
   Collection.addDeviceBinary(
       std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart,
@@ -972,6 +995,23 @@ std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
   return Encoded;
 }
 
+std::vector<uint8_t> jit_compiler::encodeReqdWorkGroupSize(
+    const ::jit_compiler::SYCLKernelAttribute &Attr) const {
+  assert(Attr.AttributeName == "reqd_work_group_size");
+  size_t NumBytes = sizeof(uint64_t) + (Attr.Values.size() * sizeof(uint32_t));
+  std::vector<uint8_t> Encoded(NumBytes, 0u);
+  uint8_t *Ptr = Encoded.data();
+  // Skip 64-bit wide size argument with value 0 at the start of the data.
+  // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference.
+  Ptr += sizeof(uint64_t);
+  for (const auto &Val : Attr.Values) {
+    uint32_t UVal = std::stoul(Val);
+    std::memcpy(Ptr, &UVal, sizeof(uint32_t));
+    Ptr += sizeof(uint32_t);
+  }
+  return Encoded;
+}
+
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index e02be562de3ee..4b299572bacbc 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -16,6 +16,7 @@ namespace jit_compiler {
 enum class BinaryFormat;
 class JITContext;
 struct SYCLKernelInfo;
+struct SYCLKernelAttribute;
 using ArgUsageMask = std::vector<unsigned char>;
 } // namespace jit_compiler
 
@@ -47,11 +48,15 @@ class jit_compiler {
   jit_compiler &operator=(const jit_compiler &&) = delete;
 
   pi_device_binaries
-  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, ::jit_compiler::BinaryFormat Format);
+  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
+                       ::jit_compiler::BinaryFormat Format);
 
   std::vector<uint8_t>
   encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const;
 
+  std::vector<uint8_t> encodeReqdWorkGroupSize(
+      const ::jit_compiler::SYCLKernelAttribute &Attr) const;
+
   // Manages the lifetime of the PI structs for device binaries.
   std::vector<DeviceBinariesCollection> JITDeviceBinaries;
 
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
index 59530a0e691ce..d0cc9e824bc07 100644
--- a/sycl/source/detail/jit_device_binaries.cpp
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -81,7 +81,8 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
 }
 
 pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
-    const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec, pi_device_binary_type Format) {
+    const unsigned char *BinaryStart, size_t BinarySize, const char *TargetSpec,
+    pi_device_binary_type Format) {
   pi_device_binary_struct DeviceBinary;
   DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
   DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
@@ -106,7 +107,8 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
 void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
                                                const unsigned char *BinaryStart,
                                                size_t BinarySize,
-                                               const char* TargetSpec, pi_device_binary_type Format) {
+                                               const char *TargetSpec,
+                                               pi_device_binary_type Format) {
   // Adding to the vectors might trigger reallocation, which would invalidate
   // the pointers used for PI structs if a PI struct has already been created
   // via getPIDeviceStruct(). Forbid calls to this method after the first PI
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
index 96079d3a25a19..cecab17870650 100644
--- a/sycl/source/detail/jit_device_binaries.hpp
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -111,7 +111,8 @@ class DeviceBinaryContainer {
 
   pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
                                             size_t BinarySize,
-                                            const char* TargetSpec, pi_device_binary_type Format);
+                                            const char *TargetSpec,
+                                            pi_device_binary_type Format);
 
 private:
   bool Fused = true;
@@ -138,7 +139,7 @@ class DeviceBinariesCollection {
 
   void addDeviceBinary(DeviceBinaryContainer &&Cont,
                        const unsigned char *BinaryStart, size_t BinarySize,
-                       const char* TargetSpec, pi_device_binary_type Format);
+                       const char *TargetSpec, pi_device_binary_type Format);
   pi_device_binaries getPIDeviceStruct();
 
 private:

From 1559b85b00a17ad2eadb71fe229217f704038449 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 14 Feb 2023 13:39:10 +0000
Subject: [PATCH 11/25] [SYCL][Fusion] Cache and groom input binaries

Parse each input binary only once.

Groom the nvvm annotations for functions deleted before fusion.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 .../jit-compiler/lib/fusion/ModuleHelper.cpp  |   8 ++
 .../lib/translation/KernelTranslation.cpp     | 104 +++++++++---------
 2 files changed, 60 insertions(+), 52 deletions(-)

diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
index c197fedf47e69..0a90600530f74 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
@@ -8,6 +8,7 @@
 
 #include "ModuleHelper.h"
 
+#include "target/TargetFusionInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -22,6 +23,13 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod,
   SmallPtrSet<Function *, 16> UnusedFunctions;
   identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions);
 
+  {
+    auto TFI = llvm::TargetFusionInfo::getTargetFusionInfo(Mod);
+    SmallVector<Function *> Unused{UnusedFunctions.begin(),
+                                   UnusedFunctions.end()};
+    TFI.notifyFunctionsDelete(Unused);
+  }
+
   // Clone the module, but use an external reference in place of the global
   // definition for unused functions.
   auto FunctionCloneMask = [&](const GlobalValue *GV) -> bool {
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 1436f51f40aaa..b877829782658 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -75,68 +75,68 @@ KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx,
     const unsigned char *ModulePtr = BinInfo.BinaryStart;
     size_t ModuleSize = BinInfo.BinarySize;
     BinaryBlob BinBlob{ModulePtr, ModuleSize};
-    if (ParsedBinaries.contains(BinBlob)) {
+    if (!ParsedBinaries.contains(BinBlob)) {
       // Multiple kernels can be stored in the same SPIR-V or LLVM IR module.
-      // If we encountered the same binary module before, skip.
+      // We only load if we did not encounter the same binary module before.
       // NOTE: We compare the pointer as well as the size, in case
       // a previous kernel only referenced part of the SPIR-V/LLVM IR module.
       // Not sure this can actually happen, but better safe than sorry.
-      continue;
-    }
-    // Simply load and translate the SPIR-V into the currently still empty
-    // module.
-    std::unique_ptr<llvm::Module> NewMod;
+      // Simply load and translate the SPIR-V into the currently still empty
+      // module.
+      std::unique_ptr<llvm::Module> NewMod;
 
-    switch (BinInfo.Format) {
-    case BinaryFormat::LLVM: {
-      auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel);
-      if (auto Err = ModOrError.takeError()) {
-        return std::move(Err);
+      switch (BinInfo.Format) {
+      case BinaryFormat::LLVM: {
+        auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel);
+        if (auto Err = ModOrError.takeError()) {
+          return std::move(Err);
+        }
+        NewMod = std::move(*ModOrError);
+        break;
       }
-      NewMod = std::move(*ModOrError);
-      break;
-    }
-    case BinaryFormat::SPIRV: {
-      auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel);
-      if (auto Err = ModOrError.takeError()) {
-        return std::move(Err);
+      case BinaryFormat::SPIRV: {
+        auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel);
+        if (auto Err = ModOrError.takeError()) {
+          return std::move(Err);
+        }
+        NewMod = std::move(*ModOrError);
+        break;
       }
-      NewMod = std::move(*ModOrError);
-      break;
-    }
-    default: {
-      return createStringError(
-          inconvertibleErrorCode(),
-          "Failed to load kernel from unsupported input format");
-    }
-    }
-
-    // We do not assume that the input binary information has the address bits
-    // set, but rather retrieve this information from the SPIR-V/LLVM module's
-    // data-layout.
-    BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
-
-    if (First) {
-      // We can simply assign the module we just loaded from SPIR-V to the
-      // empty pointer on the first iteration.
-      Result = std::move(NewMod);
-      // The first module will dictate the address bits for the remaining.
-      AddressBits = BinInfo.AddressBits;
-      First = false;
-    } else {
-      // We have already loaded some module, so now we need to
-      // link the module we just loaded with the result so far.
-      // FIXME: We allow duplicates to be overridden by the module
-      // read last. This could cause problems if different modules contain
-      // definitions with the same name, but different body/content.
-      // Check that this is not problematic.
-      Linker::linkModules(*Result, std::move(NewMod),
-                          Linker::Flags::OverrideFromSrc);
-      if (AddressBits != BinInfo.AddressBits) {
+      default: {
         return createStringError(
             inconvertibleErrorCode(),
-            "Number of address bits between SPIR-V modules does not match");
+            "Failed to load kernel from unsupported input format");
+      }
+      }
+
+      // We do not assume that the input binary information has the address bits
+      // set, but rather retrieve this information from the SPIR-V/LLVM module's
+      // data-layout.
+      BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits();
+
+      if (First) {
+        // We can simply assign the module we just loaded from SPIR-V to the
+        // empty pointer on the first iteration.
+        Result = std::move(NewMod);
+        // The first module will dictate the address bits for the remaining.
+        AddressBits = BinInfo.AddressBits;
+        First = false;
+      } else {
+        // We have already loaded some module, so now we need to
+        // link the module we just loaded with the result so far.
+        // FIXME: We allow duplicates to be overridden by the module
+        // read last. This could cause problems if different modules contain
+        // definitions with the same name, but different body/content.
+        // Check that this is not problematic.
+        Linker::linkModules(*Result, std::move(NewMod),
+                            Linker::Flags::OverrideFromSrc);
+        if (AddressBits != BinInfo.AddressBits) {
+          return createStringError(
+              inconvertibleErrorCode(),
+              "Number of address bits between SPIR-V modules does not match");
+        }
       }
+      ParsedBinaries.insert(BinBlob);
     }
     // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or
     // 'work_group_size_hint' from metadata attached to the kernel function and

From fd341247108ad692526d4b22719b0b2d66350262 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 14 Feb 2023 15:44:25 +0000
Subject: [PATCH 12/25] [SYCL][Fusion] Disable heterogeneous ND ranges on CUDA

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl-fusion/jit-compiler/CMakeLists.txt       |  1 +
 sycl-fusion/jit-compiler/include/JITContext.h |  1 -
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 15 ++++++++++-----
 sycl-fusion/passes/CMakeLists.txt             |  1 +
 sycl-fusion/passes/kernel-fusion/Builtins.cpp | 12 ------------
 sycl-fusion/passes/kernel-fusion/Builtins.h   |  4 ----
 sycl/source/detail/jit_compiler.hpp           |  2 +-
 7 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index f67eed5fc6c76..777f06b303e0c 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_library(sycl-fusion
    ScalarOpts
    InstCombine
    Target
+   TargetParser
    NVPTX
    X86
    MC
diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index c26312ceb70e6..eda8c83d8b21c 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -88,7 +88,6 @@ class JITContext {
   mutable MutexT CacheMutex;
 
   std::unordered_map<CacheKeyT, SYCLKernelInfo> Cache;
-  
 };
 } // namespace jit_compiler
 
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 821ad4cd36369..e2f321bbcb81b 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -56,6 +56,10 @@ FusionResult KernelFusion::fuseKernels(
     int BarriersFlags,
     const std::vector<jit_compiler::ParameterInternalization> &Internalization,
     const std::vector<jit_compiler::JITConstant> &Constants) {
+  // Initialize the configuration helper to make the options for this invocation
+  // available (on a per-thread basis).
+  ConfigHelper::setConfig(std::move(JITConfig));
+
   const auto NDRanges = gatherNDRanges(KernelInformation);
 
   if (!isValidCombination(NDRanges)) {
@@ -64,9 +68,12 @@ FusionResult KernelFusion::fuseKernels(
         "different global sizes in dimensions [2, N) and non-zero offsets"};
   }
 
-  // Initialize the configuration helper to make the options for this invocation
-  // available (on a per-thread basis).
-  ConfigHelper::setConfig(std::move(JITConfig));
+  bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges);
+
+  BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
+  if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) {
+    return FusionResult{"Heterogeneous ND ranges not supported for CUDA"};
+  }
 
   bool CachingEnabled = ConfigHelper::get<option::JITEnableCaching>();
   CacheKeyT CacheKey{KernelsToFuse,
@@ -137,8 +144,6 @@ FusionResult KernelFusion::fuseKernels(
 
   SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName);
 
-  BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
-
   if (auto Error = translation::KernelTranslator::translateKernel(
           FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) {
     return errorToFusionResult(std::move(Error),
diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index c74c3c40e34e0..95f504091dc30 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -46,6 +46,7 @@ add_llvm_library(SYCLKernelFusionPasses
   Support
   TransformUtils
   Passes
+  TargetParser
 )
 
 target_include_directories(SYCLKernelFusionPasses
diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.cpp b/sycl-fusion/passes/kernel-fusion/Builtins.cpp
index 6de1c40e8f4cd..9cff120cd5888 100644
--- a/sycl-fusion/passes/kernel-fusion/Builtins.cpp
+++ b/sycl-fusion/passes/kernel-fusion/Builtins.cpp
@@ -595,18 +595,6 @@ jit_compiler::Remapper::remapBuiltins(Function *F, const NDRange &SrcNDRange,
   return Clone;
 }
 
-void jit_compiler::barrierCall(IRBuilderBase &Builder, int Flags) {
-  assert((Flags == 1 || Flags == 2 || Flags == 3) && "Invalid barrier flags");
-
-  // See
-  // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
-  createSPIRVCall(Builder, BarrierName,
-                  {Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-                   Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-                   Builder.getInt32(0x10 | (Flags % 2 == 1 ? 0x100 : 0x0) |
-                                    ((Flags >> 1 == 1 ? 0x200 : 0x0)))});
-}
-
 Value *jit_compiler::createSPIRVCall(IRBuilderBase &Builder,
                                      StringRef FunctionName,
                                      ArrayRef<Value *> Args) {
diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.h b/sycl-fusion/passes/kernel-fusion/Builtins.h
index bc8e186f14188..a55b6efab39f9 100644
--- a/sycl-fusion/passes/kernel-fusion/Builtins.h
+++ b/sycl-fusion/passes/kernel-fusion/Builtins.h
@@ -60,10 +60,6 @@ constexpr llvm::StringLiteral OffloadStartWrapperName{
 llvm::Value *getGlobalLinearID(llvm::IRBuilderBase &Builder,
                                const NDRange &FusedNDRange);
 
-///
-/// Creates a call to a barrier function.
-void barrierCall(llvm::IRBuilderBase &Builder, int Flags);
-
 ///
 /// @return A call to a SPIRV function, which will be declared if not already in
 /// the module.
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 4b299572bacbc..71a57723ea6bb 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -13,7 +13,7 @@
 #include <detail/scheduler/scheduler.hpp>
 
 namespace jit_compiler {
-enum class BinaryFormat;
+enum class BinaryFormat : uint32_t;
 class JITContext;
 struct SYCLKernelInfo;
 struct SYCLKernelAttribute;

From fc5efbcf35937b83e3dceb31c48230463431e163 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 14 Feb 2023 16:14:50 +0000
Subject: [PATCH 13/25] [SYCL][Fusion] Enable JIT caching for CUDA fusion

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp | 11 ++++++++---
 sycl/source/detail/jit_compiler.hpp |  4 ++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 004fa07145b48..a7de0dd12f1a4 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -859,9 +859,14 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
     auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat);
     detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
     Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries);
-  } else if (DebugEnabled) {
-    // TODO(Lukas): Create correct OSModuleHandle when using a cached binary.
-    std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
+    CachedModules.emplace(FusedKernelInfo.Name, Handle);
+  } else {
+    if (DebugEnabled) {
+      std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
+    }
+    // Retrieve an OSModuleHandle for the cached binary.
+    assert(CachedModules.count(FusedKernelInfo.Name) && "No cached binary");
+    Handle = CachedModules.at(FusedKernelInfo.Name);
   }
 
   // Create a kernel bundle for the fused kernel.
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 71a57723ea6bb..fae774cadd09a 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -12,6 +12,8 @@
 #include <detail/scheduler/commands.hpp>
 #include <detail/scheduler/scheduler.hpp>
 
+#include <unordered_map>
+
 namespace jit_compiler {
 enum class BinaryFormat : uint32_t;
 class JITContext;
@@ -60,6 +62,8 @@ class jit_compiler {
   // Manages the lifetime of the PI structs for device binaries.
   std::vector<DeviceBinariesCollection> JITDeviceBinaries;
 
+  std::unordered_map<std::string, OSModuleHandle> CachedModules;
+
   std::unique_ptr<::jit_compiler::JITContext> MJITContext;
 };
 

From 6c14311e00f3a294ca18dbf846c7cc76683ba7ff Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 15 Feb 2023 12:56:58 +0000
Subject: [PATCH 14/25] [SYCL][Fusion] Catch empty standard arguments

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index a7de0dd12f1a4..ffcdca25584f7 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -457,7 +457,10 @@ static ParamIterator preProcessArguments(
     // which will go out-of-scope before we execute the fused kernel. Therefore,
     // we need to copy the argument to a permant location and update the
     // argument.
-    Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+    if (Arg->Arg.MPtr) {
+      Arg->Arg.MPtr =
+          storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+    }
     // Standard layout arguments do not participate in identical argument
     // detection, but we still add it to the list here. As the SYCL runtime can
     // only check the raw bytes for identical content, but is unaware of the
@@ -474,6 +477,7 @@ static ParamIterator preProcessArguments(
         ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
         Arg->Arg.MPtr, Arg->Arg.MSize);
     return ++Arg;
+    
   }
   // First check if there's already another parameter with identical
   // value.

From 980d36d539f70ac10cb9511180a8f2e332d4a48c Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 7 Mar 2023 14:03:28 +0000
Subject: [PATCH 15/25] [SYCL][Fusion] Rebase and address feedback

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 clang/include/clang/Driver/Action.h           |   2 +-
 clang/lib/Driver/Action.cpp                   |   2 +-
 sycl-fusion/jit-compiler/CMakeLists.txt       |   7 +-
 sycl-fusion/jit-compiler/include/JITContext.h |   7 +-
 sycl-fusion/jit-compiler/include/Options.h    |   1 +
 sycl-fusion/jit-compiler/lib/JITContext.cpp   |  11 +-
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |  22 +
 .../lib/fusion/FusionPipeline.cpp             |   4 +-
 .../jit-compiler/lib/fusion/ModuleHelper.cpp  |   2 +-
 .../lib/translation/KernelTranslation.cpp     |  43 +-
 .../lib/translation/KernelTranslation.h       |  10 +-
 .../lib/translation/SPIRVLLVMTranslation.cpp  |   2 +-
 .../internalization/Internalization.cpp       |   2 +-
 .../passes/kernel-fusion/SYCLKernelFusion.cpp |  13 +-
 sycl-fusion/passes/syclcp/SYCLCP.cpp          |   2 +-
 .../passes/target/TargetFusionInfo.cpp        | 458 +++++++++++-------
 sycl-fusion/passes/target/TargetFusionInfo.h  | 131 +----
 sycl/source/detail/device_info.hpp            |  19 +-
 sycl/source/detail/jit_compiler.cpp           |  38 +-
 sycl/source/detail/scheduler/commands.hpp     |   2 +-
 20 files changed, 416 insertions(+), 362 deletions(-)

diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index 9dab2e32b2ffc..45e9133b7ed13 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -665,7 +665,7 @@ class OffloadWrapperJobAction : public JobAction {
 public:
   OffloadWrapperJobAction(ActionList &Inputs, types::ID Type);
   OffloadWrapperJobAction(Action *Input, types::ID OutputType,
-                          bool IsEmbeddedIR = false);
+                          bool EmbedIR = false);
 
   bool isEmbeddedIR() const { return EmbedIR; }
 
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index f55233c59a6db..4cb0225cad293 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -478,7 +478,7 @@ void OffloadWrapperJobAction::anchor() {}
 
 OffloadWrapperJobAction::OffloadWrapperJobAction(ActionList &Inputs,
                                                  types::ID Type)
-  : JobAction(OffloadWrapperJobClass, Inputs, Type) {}
+    : JobAction(OffloadWrapperJobClass, Inputs, Type), EmbedIR(false) {}
 
 OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input, types::ID Type,
                                                  bool IsEmbeddedIR)
diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index 777f06b303e0c..bf323239679b4 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -22,9 +22,8 @@ add_llvm_library(sycl-fusion
    InstCombine
    Target
    TargetParser
-   NVPTX
-   X86
    MC
+   ${LLVM_TARGETS_TO_BUILD}
 )
 
 target_include_directories(sycl-fusion
@@ -47,6 +46,10 @@ target_link_libraries(sycl-fusion
   ${CMAKE_THREAD_LIBS_INIT}
 )
 
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX)
+endif()
+
 if (BUILD_SHARED_LIBS)
   if(NOT MSVC AND NOT APPLE)
     # Manage symbol visibility through the linker to make sure no LLVM symbols
diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h
index eda8c83d8b21c..69465a74d8371 100644
--- a/sycl-fusion/jit-compiler/include/JITContext.h
+++ b/sycl-fusion/jit-compiler/include/JITContext.h
@@ -39,7 +39,7 @@ using CacheKeyT =
 /// Wrapper around a kernel binary.
 class KernelBinary {
 public:
-  explicit KernelBinary(std::string Binary, BinaryFormat Format);
+  explicit KernelBinary(std::string &&Binary, BinaryFormat Format);
 
   jit_compiler::BinaryAddress address() const;
 
@@ -65,7 +65,10 @@ class JITContext {
 
   llvm::LLVMContext *getLLVMContext();
 
-  KernelBinary &emplaceSPIRVBinary(std::string Binary, BinaryFormat Format);
+  template <typename... Ts> KernelBinary &emplaceKernelBinary(Ts &&...Args) {
+    WriteLockT WriteLock{BinariesMutex};
+    return Binaries.emplace_back(std::forward<Ts>(Args)...);
+  }
 
   std::optional<SYCLKernelInfo> getCacheEntry(CacheKeyT &Identifier) const;
 
diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h
index 4fe7787df00db..841a229adb7a3 100644
--- a/sycl-fusion/jit-compiler/include/Options.h
+++ b/sycl-fusion/jit-compiler/include/Options.h
@@ -10,6 +10,7 @@
 #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H
 
 #include "Kernel.h"
+
 #include <memory>
 #include <unordered_map>
 
diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp
index e1dda8b928c45..4499dd93f4d76 100644
--- a/sycl-fusion/jit-compiler/lib/JITContext.cpp
+++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp
@@ -11,7 +11,7 @@
 
 using namespace jit_compiler;
 
-KernelBinary::KernelBinary(std::string Binary, BinaryFormat Fmt)
+KernelBinary::KernelBinary(std::string &&Binary, BinaryFormat Fmt)
     : Blob{std::move(Binary)}, Format{Fmt} {}
 
 jit_compiler::BinaryAddress KernelBinary::address() const {
@@ -29,15 +29,6 @@ JITContext::~JITContext() = default;
 
 llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); }
 
-KernelBinary &JITContext::emplaceSPIRVBinary(std::string Binary,
-                                             BinaryFormat Format) {
-  WriteLockT WriteLock{BinariesMutex};
-  // NOTE: With C++17, which returns a reference from emplace_back, the
-  // following code would be even simpler.
-  Binaries.emplace_back(std::move(Binary), Format);
-  return Binaries.back();
-}
-
 std::optional<SYCLKernelInfo>
 JITContext::getCacheEntry(CacheKeyT &Identifier) const {
   ReadLockT ReadLock{CacheMutex};
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index e2f321bbcb81b..be7515d935247 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -48,6 +48,22 @@ gatherNDRanges(llvm::ArrayRef<SYCLKernelInfo> KernelInformation) {
   return NDRanges;
 }
 
+static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
+  switch (TargetFormat) {
+  case BinaryFormat::SPIRV:
+    return true;
+  case BinaryFormat::PTX: {
+#ifdef FUSION_JIT_SUPPORT_PTX
+    return true;
+#else  // FUSION_JIT_SUPPORT_PTX
+    return false;
+#endif // FUSION_JIT_SUPPORT_PTX
+  }
+  default:
+    return false;
+  }
+}
+
 FusionResult KernelFusion::fuseKernels(
     JITContext &JITCtx, Config &&JITConfig,
     const std::vector<SYCLKernelInfo> &KernelInformation,
@@ -71,6 +87,12 @@ FusionResult KernelFusion::fuseKernels(
   bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges);
 
   BinaryFormat TargetFormat = ConfigHelper::get<option::JITTargetFormat>();
+
+  if (!isTargetFormatSupported(TargetFormat)) {
+    return FusionResult(
+        "Fusion output target format not supported by this build");
+  }
+
   if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) {
     return FusionResult{"Heterogeneous ND ranges not supported for CUDA"};
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index fb2e2b15200a7..965202fe5bbfa 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -23,8 +23,8 @@
 #ifndef NDEBUG
 #include "llvm/IR/Verifier.h"
 #endif // NDEBUG
-#include "llvm/ADT/Triple.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
@@ -103,7 +103,7 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
     // to/from generic address-space as possible, because these hinder
     // internalization.
     // Ideally, the static compiler should have performed that job.
-    unsigned FlatAddressSpace = getFlatAddressSpace(Mod);
+    const unsigned FlatAddressSpace = getFlatAddressSpace(Mod);
     FPM.addPass(InferAddressSpacesPass(FlatAddressSpace));
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
index 0a90600530f74..0d6ac7f48fbbe 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp
@@ -24,7 +24,7 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod,
   identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions);
 
   {
-    auto TFI = llvm::TargetFusionInfo::getTargetFusionInfo(Mod);
+    TargetFusionInfo TFI{Mod};
     SmallVector<Function *> Unused{UnusedFunctions.begin(),
                                    UnusedFunctions.end()};
     TFI.notifyFunctionsDelete(Unused);
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index b877829782658..8b1f3d913026d 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "KernelTranslation.h"
+
 #include "SPIRVLLVMTranslation.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/Constants.h"
@@ -182,7 +183,8 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
     break;
   }
   case BinaryFormat::PTX: {
-    llvm::Expected<KernelBinary *> BinaryOrError = translateToPTX(Mod, JITCtx);
+    llvm::Expected<KernelBinary *> BinaryOrError =
+        translateToPTX(Kernel, Mod, JITCtx);
     if (auto Error = BinaryOrError.takeError()) {
       return Error;
     }
@@ -215,12 +217,20 @@ KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) {
 }
 
 llvm::Expected<KernelBinary *>
-KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
-  // FIXME: Can we limit this to the NVPTX specific target?
-  llvm::InitializeAllTargets();
-  llvm::InitializeAllAsmParsers();
-  llvm::InitializeAllAsmPrinters();
-  llvm::InitializeAllTargetMCs();
+KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
+                                 JITContext &JITCtx) {
+#ifndef FUSION_JIT_SUPPORT_PTX
+  return createStringError(inconvertibleErrorCode(),
+                           "PTX translation not supported in this build");
+#else  // FUSION_JIT_SUPPORT_PTX
+  LLVMInitializeNVPTXTargetInfo();
+  LLVMInitializeNVPTXTarget();
+  LLVMInitializeNVPTXAsmPrinter();
+  LLVMInitializeNVPTXTargetMC();
+#endif // FUSION_JIT_SUPPORT_PTX
+
+  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
+  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
 
   std::string TargetTriple{"nvptx64-nvidia-cuda"};
 
@@ -231,13 +241,26 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
   if (!Target) {
     return createStringError(
         inconvertibleErrorCode(),
-        "Failed to load and translate SPIR-V module with error %s",
+        "Failed to load and translate PTX LLVM IR module with error %s",
         ErrorMessage.c_str());
   }
 
+  llvm::StringRef TargetCPU{"sm_50"};
+  llvm::StringRef TargetFeatures{"+sm_50,+ptx76"};
+  if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) {
+    if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
+      TargetCPU =
+          KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
+    }
+    if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
+      TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
+                           .getValueAsString();
+    }
+  }
+
   // FIXME: Check whether we can provide more accurate target information here
   auto *TargetMachine = Target->createTargetMachine(
-      TargetTriple, "sm_50", "+sm_50,+ptx76", {}, llvm::Reloc::PIC_,
+      TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_,
       std::nullopt, llvm::CodeGenOpt::Default);
 
   llvm::legacy::PassManager PM;
@@ -259,5 +282,5 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) {
     ASMStream.flush();
   }
 
-  return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX);
+  return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX);
 }
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
index 276b6e536d269..7e4816df9bf94 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+#ifndef SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
+#define SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
 
 #include "JITContext.h"
 #include "Kernel.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include <llvm/Support/Error.h>
+#include "llvm/Support/Error.h"
 #include <vector>
 
 namespace jit_compiler {
@@ -39,8 +41,10 @@ class KernelTranslator {
   static llvm::Expected<KernelBinary *> translateToSPIRV(llvm::Module &Mod,
                                                          JITContext &JITCtx);
 
-  static llvm::Expected<KernelBinary *> translateToPTX(llvm::Module &Mod,
-                                                       JITContext &JITCtx);
+  static llvm::Expected<KernelBinary *>
+  translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx);
 };
 } // namespace translation
 } // namespace jit_compiler
+
+#endif // SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H
diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
index a5effb33a85e1..4092f9dd96fc8 100644
--- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp
@@ -94,5 +94,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) {
         "Translation of LLVM IR to SPIR-V failed with error %s",
         ErrMsg.c_str());
   }
-  return &JITCtx.emplaceSPIRVBinary(BinaryStream.str(), BinaryFormat::SPIRV);
+  return &JITCtx.emplaceKernelBinary(BinaryStream.str(), BinaryFormat::SPIRV);
 }
diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp
index bca46ba43acb3..3d5c38c799e8f 100644
--- a/sycl-fusion/passes/internalization/Internalization.cpp
+++ b/sycl-fusion/passes/internalization/Internalization.cpp
@@ -631,7 +631,7 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM,
 
 PreservedAnalyses llvm::SYCLInternalizer::run(Module &M,
                                               ModuleAnalysisManager &AM) {
-  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+  TargetFusionInfo TFI{&M};
   // Private promotion
   const PreservedAnalyses Tmp = SYCLInternalizerImpl{
       TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM);
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
index a0079affbaa06..b61a0936eb32e 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
@@ -138,7 +138,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) {
       AM.getResult<SYCLModuleInfoAnalysis>(M).ModuleInfo;
   assert(ModuleInfo && "No module information available");
 
-  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+  TargetFusionInfo TFI{&M};
 
   // Iterate over the functions in the module and locate all
   // stub functions identified by metadata.
@@ -456,11 +456,20 @@ Error SYCLKernelFusion::fuseKernel(
       FT, GlobalValue::LinkageTypes::ExternalLinkage,
       M.getDataLayout().getProgramAddressSpace(), KernelName->getString(), &M);
   {
+    auto DefaultAttr = FusedFunction->getAttributes();
+    // Add uniform function attributes, i.e., attributes with identical value on
+    // each input function, to the fused function.
+    auto *FirstFunction = InputFunctions.front().F;
+    for (const auto &UniformKey : TargetInfo.getUniformKernelAttributes()) {
+      if (FirstFunction->hasFnAttribute(UniformKey)) {
+        DefaultAttr = DefaultAttr.addFnAttribute(
+            LLVMCtx, FirstFunction->getFnAttribute(UniformKey));
+      }
+    }
     // Add the collected parameter attributes to the fused function.
     // Copying the parameter attributes from their original definition in the
     // input kernels should be safe and they most likely can't be deducted later
     // on, as no caller is present in the module.
-    auto DefaultAttr = FusedFunction->getAttributes();
     auto FusedFnAttrs =
         AttributeList::get(LLVMCtx, DefaultAttr.getFnAttrs(),
                            DefaultAttr.getRetAttrs(), FusedParamAttributes);
diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp
index 5cb9b00433ef7..b928a33760da0 100644
--- a/sycl-fusion/passes/syclcp/SYCLCP.cpp
+++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp
@@ -249,7 +249,7 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) {
     Changed = propagateConstants(F, *ConstantsOrErr) || Changed;
   }
 
-  auto TFI = TargetFusionInfo::getTargetFusionInfo(&M);
+  TargetFusionInfo TFI{&M};
 
   if (Changed) {
     moduleCleanup(M, AM, TFI);
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
index c17e2464ac323..27514bbb80269 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.cpp
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -7,234 +7,336 @@
 //===----------------------------------------------------------------------===//
 
 #include "TargetFusionInfo.h"
-#include "llvm/ADT/Triple.h"
+
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/TargetParser/Triple.h"
 
-using namespace llvm;
+namespace llvm {
+class TargetFusionInfoImpl {
 
-//
-// TargetFusionInfo
-//
+public:
+  explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
 
-TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) {
-  llvm::Triple Tri(Mod->getTargetTriple());
-  if (Tri.isNVPTX()) {
-    return TargetFusionInfo(
-        std::shared_ptr<NVPTXTargetFusionInfo>(new NVPTXTargetFusionInfo(Mod)));
-  }
-  if (Tri.isSPIRV() || Tri.isSPIR()) {
-    return TargetFusionInfo(
-        std::shared_ptr<SPIRVTargetFusionInfo>(new SPIRVTargetFusionInfo(Mod)));
-  }
-  assert(false && "Unsupported target for fusion");
-}
+  virtual ~TargetFusionInfoImpl() = default;
+
+  virtual void notifyFunctionsDelete(
+      [[maybe_unused]] llvm::ArrayRef<Function *> Funcs) const {}
+
+  virtual void addKernelFunction([[maybe_unused]] Function *KernelFunc) const {}
+
+  virtual void postProcessKernel([[maybe_unused]] Function *KernelFunc) const {}
+
+  virtual ArrayRef<StringRef> getKernelMetadataKeys() const { return {}; }
+
+  virtual ArrayRef<StringRef> getUniformKernelAttributes() const { return {}; }
+
+  virtual void createBarrierCall(IRBuilderBase &Builder,
+                                 int BarrierFlags) const = 0;
+
+  virtual unsigned getPrivateAddressSpace() const = 0;
+
+  virtual unsigned getLocalAddressSpace() const = 0;
+
+  virtual void
+  updateAddressSpaceMetadata([[maybe_unused]] Function *KernelFunc,
+                             [[maybe_unused]] ArrayRef<size_t> LocalSize,
+                             [[maybe_unused]] unsigned AddressSpace) const {}
+
+protected:
+  llvm::Module *LLVMMod;
+};
 
 //
 // SPIRVTargetFusionInfo
 //
+class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
 
-void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) const {
-  KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
-}
-
-ArrayRef<StringRef> SPIRVTargetFusionInfo::getKernelMetadataKeys() const {
-  // NOTE: We do not collect the "kernel_arg_name" metadata, because
-  // the kernel arguments receive new names in the fused kernel.
-  static SmallVector<StringRef> Keys{
-      {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
-       "kernel_arg_base_type", "kernel_arg_type_qual"}};
-  return Keys;
-}
+  void addKernelFunction(Function *KernelFunc) const override {
+    KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL);
+  }
 
-void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) const {
-  // Attach the kernel_arg_name metadata.
-  SmallVector<Metadata *, 16> KernelArgNames;
-  for (auto &P : KernelFunc->args()) {
-    KernelArgNames.push_back(MDString::get(LLVMMod->getContext(), P.getName()));
+  ArrayRef<StringRef> getKernelMetadataKeys() const override {
+    // NOTE: We do not collect the "kernel_arg_name" metadata, because
+    // the kernel arguments receive new names in the fused kernel.
+    static SmallVector<StringRef> Keys{
+        {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type",
+         "kernel_arg_base_type", "kernel_arg_type_qual"}};
+    return Keys;
   }
-  auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames);
-  KernelFunc->setMetadata("kernel_arg_name", ArgNameMD);
-
-  static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
-  static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
-  // Remove all existing calls of the ITT instrumentation functions. Insert new
-  // ones in the entry block of the fused kernel and every exit block if the
-  // functions are present in the module.
-  // We cannot use the existing SPIRITTAnnotations pass, because that pass might
-  // insert calls to functions not present in the module (e.g., ITT
-  // instrumentations for barriers). As the JITed module is not linked with
-  // libdevice anymore, the functions would remain unresolved and cause the
-  // driver to fail.
-  Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper);
-  Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper);
-  bool InsertWrappers =
-      ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
-       (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
-  auto *WrapperFuncTy = FunctionType::get(
-      Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false);
-  for (auto &BB : *KernelFunc) {
-    for (auto Inst = BB.begin(); Inst != BB.end();) {
-      if (auto *CB = dyn_cast<CallBase>(Inst)) {
-        if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
-          Inst = Inst->eraseFromParent();
-          continue;
+
+  void postProcessKernel(Function *KernelFunc) const override {
+    // Attach the kernel_arg_name metadata.
+    SmallVector<Metadata *> KernelArgNames;
+    for (auto &P : KernelFunc->args()) {
+      KernelArgNames.push_back(
+          MDString::get(LLVMMod->getContext(), P.getName()));
+    }
+    auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames);
+    KernelFunc->setMetadata("kernel_arg_name", ArgNameMD);
+
+    static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper";
+    static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper";
+    // Remove all existing calls of the ITT instrumentation functions. Insert
+    // new ones in the entry block of the fused kernel and every exit block if
+    // the functions are present in the module. We cannot use the existing
+    // SPIRITTAnnotations pass, because that pass might insert calls to
+    // functions not present in the module (e.g., ITT instrumentations for
+    // barriers). As the JITed module is not linked with libdevice anymore, the
+    // functions would remain unresolved and cause the driver to fail.
+    Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper);
+    Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper);
+    bool InsertWrappers =
+        ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) &&
+         (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration()));
+    auto *WrapperFuncTy = FunctionType::get(
+        Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false);
+    for (auto &BB : *KernelFunc) {
+      for (auto Inst = BB.begin(); Inst != BB.end();) {
+        if (auto *CB = dyn_cast<CallBase>(Inst)) {
+          if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) {
+            Inst = Inst->eraseFromParent();
+            continue;
+          }
+        }
+        ++Inst;
+      }
+      if (InsertWrappers) {
+        if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+          auto *WrapperCall =
+              CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
+          WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
         }
       }
-      ++Inst;
     }
     if (InsertWrappers) {
-      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-        auto *WrapperCall =
-            CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI);
-        WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-      }
+      KernelFunc->getEntryBlock().getFirstInsertionPt();
+      auto *WrapperCall =
+          CallInst::Create(WrapperFuncTy, StartWrapperFunc, "",
+                           &*KernelFunc->getEntryBlock().getFirstInsertionPt());
+      WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
     }
   }
-  if (InsertWrappers) {
-    KernelFunc->getEntryBlock().getFirstInsertionPt();
-    auto *WrapperCall =
-        CallInst::Create(WrapperFuncTy, StartWrapperFunc, "",
-                         &*KernelFunc->getEntryBlock().getFirstInsertionPt());
-    WrapperCall->setCallingConv(CallingConv::SPIR_FUNC);
-  }
-}
 
-void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
-                                              int BarrierFlags) const {
-  if (BarrierFlags == -1) {
-    return;
-  }
-  assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) &&
-         "Invalid barrier flags");
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override {
+    if (BarrierFlags == -1) {
+      return;
+    }
+    assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) &&
+           "Invalid barrier flags");
+
+    static const auto FnAttrs = AttributeSet::get(
+        LLVMMod->getContext(),
+        {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent),
+         Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)});
 
-  static const auto FnAttrs = AttributeSet::get(
-      LLVMMod->getContext(),
-      {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent),
-       Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)});
+    static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"};
 
-  static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"};
+    Function *F = LLVMMod->getFunction(N);
+    if (!F) {
+      constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage;
 
-  Function *F = LLVMMod->getFunction(N);
-  if (!F) {
-    constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage;
+      auto *Ty = FunctionType::get(
+          Builder.getVoidTy(),
+          {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
+          false /* isVarArg*/);
 
-    auto *Ty = FunctionType::get(
-        Builder.getVoidTy(),
-        {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
-        false /* isVarArg*/);
+      F = Function::Create(Ty, Linkage, N, *LLVMMod);
 
-    F = Function::Create(Ty, Linkage, N, *LLVMMod);
+      F->setAttributes(
+          AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
+      F->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+
+    // See
+    // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
+    SmallVector<Value *> Args{
+        Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+        Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
+        Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) |
+                         ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))};
 
-    F->setAttributes(
+    auto *BarrierCallInst = Builder.CreateCall(F, Args);
+    BarrierCallInst->setAttributes(
         AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
-    F->setCallingConv(CallingConv::SPIR_FUNC);
+    BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
   }
 
-  // See
-  // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-
-  SmallVector<Value *> Args{
-      Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-      Builder.getInt32(/*Exec Scope : Workgroup = */ 2),
-      Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) |
-                       ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))};
-
-  auto *BarrierCallInst = Builder.CreateCall(F, Args);
-  BarrierCallInst->setAttributes(
-      AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {}));
-  BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC);
-}
+  // Corresponds to definition of spir_private and spir_local in
+  // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
 
-void SPIRVTargetFusionInfo::updateAddressSpaceMetadata(
-    Function *KernelFunc, ArrayRef<size_t> LocalSize,
-    unsigned AddressSpace) const {
-  static constexpr unsigned AddressSpaceBitWidth{32};
-  static constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"};
-
-  auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
-      IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth),
-      AddressSpace));
-  if (auto *AddrspaceMD = dyn_cast_or_null<MDNode>(
-          KernelFunc->getMetadata(KernelArgAddrSpaceMD))) {
-    // If we have kernel_arg_addr_space metadata in the original function,
-    // we should update it in the new one.
-    SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
-                                    AddrspaceMD->op_end()};
-    for (auto I : enumerate(LocalSize)) {
-      if (I.value() == 0) {
-        continue;
-      }
-      const auto Index = I.index();
-      if (const auto *PtrTy =
-              dyn_cast<PointerType>(KernelFunc->getArg(Index)->getType())) {
-        if (PtrTy->getAddressSpace() == getLocalAddressSpace()) {
-          NewInfo[Index] = NewAddrspace;
+  void updateAddressSpaceMetadata(Function *KernelFunc,
+                                  ArrayRef<size_t> LocalSize,
+                                  unsigned AddressSpace) const override {
+    static constexpr unsigned AddressSpaceBitWidth{32};
+    static constexpr StringLiteral KernelArgAddrSpaceMD{
+        "kernel_arg_addr_space"};
+
+    auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get(
+        IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth),
+        AddressSpace));
+    if (auto *AddrspaceMD = dyn_cast_or_null<MDNode>(
+            KernelFunc->getMetadata(KernelArgAddrSpaceMD))) {
+      // If we have kernel_arg_addr_space metadata in the original function,
+      // we should update it in the new one.
+      SmallVector<Metadata *> NewInfo{AddrspaceMD->op_begin(),
+                                      AddrspaceMD->op_end()};
+      for (auto I : enumerate(LocalSize)) {
+        if (I.value() == 0) {
+          continue;
+        }
+        const auto Index = I.index();
+        if (const auto *PtrTy =
+                dyn_cast<PointerType>(KernelFunc->getArg(Index)->getType())) {
+          if (PtrTy->getAddressSpace() == getLocalAddressSpace()) {
+            NewInfo[Index] = NewAddrspace;
+          }
         }
       }
+      KernelFunc->setMetadata(KernelArgAddrSpaceMD,
+                              MDNode::get(KernelFunc->getContext(), NewInfo));
     }
-    KernelFunc->setMetadata(KernelArgAddrSpaceMD,
-                            MDNode::get(KernelFunc->getContext(), NewInfo));
   }
-}
+};
 
 //
 // NVPTXTargetFusionInfo
 //
+class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
+public:
+  using TargetFusionInfoImpl::TargetFusionInfoImpl;
 
-void NVPTXTargetFusionInfo::notifyFunctionsDelete(
-    llvm::ArrayRef<Function *> Funcs) const {
-  SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
-  SmallVector<MDNode *> ValidKernels;
-  auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
-  for (auto *Op : OldAnnotations->operands()) {
-    if (auto *TOp = dyn_cast<MDTuple>(Op)) {
-      if (auto *COp = dyn_cast_if_present<ConstantAsMetadata>(
-              TOp->getOperand(0).get())) {
-        if (!DeletedFuncs.contains(COp->getValue())) {
-          ValidKernels.push_back(Op);
-          // Add to the set to also remove duplicate entries.
-          DeletedFuncs.insert(COp->getValue());
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override {
+    SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
+    SmallVector<MDNode *> ValidKernels;
+    auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations");
+    for (auto *Op : OldAnnotations->operands()) {
+      if (auto *TOp = dyn_cast<MDTuple>(Op)) {
+        if (auto *COp = dyn_cast_if_present<ConstantAsMetadata>(
+                TOp->getOperand(0).get())) {
+          if (!DeletedFuncs.contains(COp->getValue())) {
+            ValidKernels.push_back(Op);
+            // Add to the set to also remove duplicate entries.
+            DeletedFuncs.insert(COp->getValue());
+          }
         }
       }
     }
+    LLVMMod->eraseNamedMetadata(OldAnnotations);
+    auto *NewAnnotations =
+        LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+    for (auto *Kernel : ValidKernels) {
+      NewAnnotations->addOperand(Kernel);
+    }
+  }
+
+  void addKernelFunction(Function *KernelFunc) const override {
+    auto *NVVMAnnotations =
+        LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
+    auto *MDOne = ConstantAsMetadata::get(
+        ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
+    auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel");
+    auto *MDFunc = ConstantAsMetadata::get(KernelFunc);
+    SmallVector<Metadata *, 3> KernelMD({MDFunc, MDKernelString, MDOne});
+    auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD);
+    NVVMAnnotations->addOperand(Tuple);
+  }
+
+  ArrayRef<StringRef> getKernelMetadataKeys() const override {
+    // FIXME: Check whether we need to take care of sycl_fixed_targets.
+    static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
+                                        "kernel_arg_runtime_aligned",
+                                        "kernel_arg_exclusive_ptr"}};
+    return Keys;
+  }
+
+  ArrayRef<StringRef> getUniformKernelAttributes() const override {
+    static SmallVector<StringRef> Keys{
+        {"target-cpu", "target-features", "uniform-work-group-size"}};
+    return Keys;
+  }
+
+  void createBarrierCall(IRBuilderBase &Builder,
+                         int BarrierFlags) const override {
+    if (BarrierFlags == -1) {
+      return;
+    }
+    // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX
+    // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0
+    // instruction, equivalent to the __syncthreads() call in CUDA."
+    Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {});
+  }
+
+  // Corresponds to the definitions in the LLVM NVPTX backend user guide:
+  // https://llvm.org/docs/NVPTXUsage.html#address-spaces
+  unsigned getPrivateAddressSpace() const override { return 0; }
+  unsigned getLocalAddressSpace() const override { return 3; }
+};
+
+//
+// TargetFusionInfo
+//
+
+TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) {
+  llvm::Triple Tri(Mod->getTargetTriple());
+  if (Tri.isNVPTX()) {
+    Impl = std::make_shared<NVPTXTargetFusionInfo>(Mod);
+    return;
   }
-  LLVMMod->eraseNamedMetadata(OldAnnotations);
-  auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
-  for (auto *Kernel : ValidKernels) {
-    NewAnnotations->addOperand(Kernel);
+  if (Tri.isSPIRV() || Tri.isSPIR()) {
+    Impl = std::make_shared<SPIRVTargetFusionInfo>(Mod);
+    return;
   }
+  llvm_unreachable("Unsupported target for fusion");
 }
 
-void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) const {
-  auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations");
-  auto *MDOne = ConstantAsMetadata::get(
-      ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
-  auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel");
-  auto *MDFunc = ConstantAsMetadata::get(KernelFunc);
-  SmallVector<Metadata *, 3> KernelMD({MDFunc, MDKernelString, MDOne});
-  auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD);
-  NVVMAnnotations->addOperand(Tuple);
+void TargetFusionInfo::notifyFunctionsDelete(
+    llvm::ArrayRef<Function *> Funcs) const {
+  Impl->notifyFunctionsDelete(Funcs);
 }
 
-ArrayRef<StringRef> NVPTXTargetFusionInfo::getKernelMetadataKeys() const {
-  // FIXME: Check whether we need to take care of sycl_fixed_targets.
-  static SmallVector<StringRef> Keys{{"kernel_arg_buffer_location",
-                                      "kernel_arg_runtime_aligned",
-                                      "kernel_arg_exclusive_ptr"}};
-  return Keys;
+void TargetFusionInfo::addKernelFunction(llvm::Function *KernelFunc) const {
+  Impl->addKernelFunction(KernelFunc);
 }
 
-void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
-                                              int BarrierFlags) const {
-  if (BarrierFlags == -1) {
-    return;
-  }
-  // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX
-  // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0
-  // instruction, equivalent to the __syncthreads() call in CUDA."
-  Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {});
+void TargetFusionInfo::postProcessKernel(Function *KernelFunc) const {
+  Impl->postProcessKernel(KernelFunc);
+}
+
+llvm::ArrayRef<llvm::StringRef>
+TargetFusionInfo::getKernelMetadataKeys() const {
+  return Impl->getKernelMetadataKeys();
+}
+
+void TargetFusionInfo::createBarrierCall(IRBuilderBase &Builder,
+                                         int BarrierFlags) const {
+  Impl->createBarrierCall(Builder, BarrierFlags);
+}
+
+unsigned TargetFusionInfo::getPrivateAddressSpace() const {
+  return Impl->getPrivateAddressSpace();
+}
+
+unsigned TargetFusionInfo::getLocalAddressSpace() const {
+  return Impl->getLocalAddressSpace();
+}
+
+void TargetFusionInfo::updateAddressSpaceMetadata(Function *KernelFunc,
+                                                  ArrayRef<size_t> LocalSize,
+                                                  unsigned AddressSpace) const {
+  Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace);
+}
+
+llvm::ArrayRef<llvm::StringRef>
+TargetFusionInfo::getUniformKernelAttributes() const {
+  return Impl->getUniformKernelAttributes();
 }
 
 //
@@ -265,3 +367,5 @@ void MetadataCollection::attachToFunction(llvm::Function *Func) {
     Func->setMetadata(Key, MDEntries);
   }
 }
+
+} // namespace llvm
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h
index c307de2dd764e..f88476c01ebc3 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.h
+++ b/sycl-fusion/passes/target/TargetFusionInfo.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
-#define SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
+#ifndef SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
+#define SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
 
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -15,90 +15,7 @@
 
 namespace llvm {
 
-class TargetFusionInfoImpl {
-
-public:
-  virtual ~TargetFusionInfoImpl() = default;
-
-  virtual void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const {
-    (void)Funcs;
-  }
-
-  virtual void addKernelFunction(Function *KernelFunc) const {
-    (void)KernelFunc;
-  }
-
-  virtual void postProcessKernel(Function *KernelFunc) const {
-    (void)KernelFunc;
-  }
-
-  virtual ArrayRef<StringRef> getKernelMetadataKeys() const { return {}; }
-
-  virtual void createBarrierCall(IRBuilderBase &Builder,
-                                 int BarrierFlags) const = 0;
-
-  virtual unsigned getPrivateAddressSpace() const = 0;
-
-  virtual unsigned getLocalAddressSpace() const = 0;
-
-  virtual void updateAddressSpaceMetadata(Function *KernelFunc,
-                                          ArrayRef<size_t> LocalSize,
-                                          unsigned AddressSpace) const {
-    (void)KernelFunc;
-    (void)LocalSize;
-  }
-
-protected:
-  explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {};
-
-  llvm::Module *LLVMMod;
-
-  friend class TargetFusionInfo;
-};
-
-class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
-public:
-  void addKernelFunction(Function *KernelFunc) const override;
-
-  ArrayRef<StringRef> getKernelMetadataKeys() const override;
-
-  void postProcessKernel(Function *KernelFunc) const override;
-
-  void createBarrierCall(IRBuilderBase &Builder,
-                         int BarrierFlags) const override;
-
-  // Corresponds to definition of spir_private and spir_local in
-  // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap".
-  unsigned getPrivateAddressSpace() const override { return 0; }
-  unsigned getLocalAddressSpace() const override { return 3; }
-
-  void updateAddressSpaceMetadata(Function *KernelFunc,
-                                  ArrayRef<size_t> LocalSize,
-                                  unsigned AddressSpace) const override;
-
-private:
-  using TargetFusionInfoImpl::TargetFusionInfoImpl;
-};
-
-class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
-public:
-  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override;
-
-  void addKernelFunction(Function *KernelFunc) const override;
-
-  ArrayRef<StringRef> getKernelMetadataKeys() const override;
-
-  void createBarrierCall(IRBuilderBase &Builder,
-                         int BarrierFlags) const override;
-
-  // Corresponds to the definitions in the LLVM NVPTX backend user guide:
-  // https://llvm.org/docs/NVPTXUsage.html#address-spaces
-  unsigned getPrivateAddressSpace() const override { return 0; }
-  unsigned getLocalAddressSpace() const override { return 3; }
-
-private:
-  using TargetFusionInfoImpl::TargetFusionInfoImpl;
-};
+class TargetFusionInfoImpl;
 
 ///
 /// Common interface to target-specific logic around handling of kernel
@@ -108,59 +25,49 @@ class TargetFusionInfo {
   ///
   /// Create the correct target-specific implementation based on the target
   /// triple of \p Module.
-  static TargetFusionInfo getTargetFusionInfo(llvm::Module *Module);
+  explicit TargetFusionInfo(llvm::Module *Module);
 
   ///
   /// Notify the target-specific implementation that set of functions \p Funcs
   /// is about to be erased from the module. This should be called BEFORE
   /// erasing the functions.
-  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const {
-    Impl->notifyFunctionsDelete(Funcs);
-  }
+  void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const;
 
   ///
   /// Notify the target-specific implementation that the function \p KernelFunc
   /// was added as a new kernel. This should be called AFTER the function has
   /// been added.
-  void addKernelFunction(llvm::Function *KernelFunc) const {
-    Impl->addKernelFunction(KernelFunc);
-  }
+  void addKernelFunction(llvm::Function *KernelFunc) const;
 
   ///
   /// Target-specific post-processing of the new kernel function \p KernelFunc.
   /// This should be called AFTER the function has been added and defined.
-  void postProcessKernel(Function *KernelFunc) const {
-    Impl->postProcessKernel(KernelFunc);
-  }
+  void postProcessKernel(Function *KernelFunc) const;
 
   ///
   /// Get the target-specific list of argument metadata attached to each
   /// function that should be collected and attached to the fused kernel.
-  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() const {
-    return Impl->getKernelMetadataKeys();
-  }
+  llvm::ArrayRef<llvm::StringRef> getKernelMetadataKeys() const;
+
+  ///
+  /// Get the target-specific list of kernel function attributes that are
+  /// uniform across all input kernels and should be attached to the fused
+  /// kernel.
+  llvm::ArrayRef<llvm::StringRef> getUniformKernelAttributes() const;
 
-  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const {
-    Impl->createBarrierCall(Builder, BarrierFlags);
-  }
+  void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const;
 
-  unsigned getPrivateAddressSpace() const {
-    return Impl->getPrivateAddressSpace();
-  }
+  unsigned getPrivateAddressSpace() const;
 
-  unsigned getLocalAddressSpace() const { return Impl->getLocalAddressSpace(); }
+  unsigned getLocalAddressSpace() const;
 
   void updateAddressSpaceMetadata(Function *KernelFunc,
                                   ArrayRef<size_t> LocalSize,
-                                  unsigned AddressSpace) const {
-    Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace);
-  }
+                                  unsigned AddressSpace) const;
 
 private:
   using ImplPtr = std::shared_ptr<TargetFusionInfoImpl>;
 
-  TargetFusionInfo(ImplPtr &&I) : Impl{I} {}
-
   ImplPtr Impl;
 };
 
@@ -183,4 +90,4 @@ class MetadataCollection {
 };
 } // namespace llvm
 
-#endif // SYCL_FUSION_PASSES_TARGETFUSIONINFO_H
+#endif // SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index efb105d1ba090..1a34178a4a3c2 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -105,18 +105,10 @@ affinityDomainToString(info::partition_affinity_domain AffinityDomain) {
 }
 
 // Mapping expected SYCL return types to those returned by PI calls
-template <typename T> struct sycl_to_pi {
-  using type = T;
-};
-template <> struct sycl_to_pi<bool> {
-  using type = pi_bool;
-};
-template <> struct sycl_to_pi<device> {
-  using type = RT::PiDevice;
-};
-template <> struct sycl_to_pi<platform> {
-  using type = RT::PiPlatform;
-};
+template <typename T> struct sycl_to_pi { using type = T; };
+template <> struct sycl_to_pi<bool> { using type = pi_bool; };
+template <> struct sycl_to_pi<device> { using type = RT::PiDevice; };
+template <> struct sycl_to_pi<platform> { using type = RT::PiPlatform; };
 
 // Mapping fp_config device info types to the values used to check fp support
 template <typename Param> struct check_fp_support {};
@@ -771,7 +763,8 @@ struct get_device_info_impl<
     // Currently fusion is only supported for SPIR-V based backends, i.e. OpenCL
     // and LevelZero.
     return (Dev->getBackend() == backend::ext_oneapi_level_zero) ||
-           (Dev->getBackend() == backend::opencl);
+           (Dev->getBackend() == backend::opencl) ||
+           (Dev->getBackend() == backend::ext_oneapi_cuda);
 #else  // SYCL_EXT_CODEPLAY_KERNEL_FUSION
     (void)Dev;
     return false;
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index ffcdca25584f7..ae20134d46b8c 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -62,21 +62,16 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
                   backend::ext_oneapi_cuda;
   if (isNvidia) {
     auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
-    std::vector<kernel_id> KernelIds;
-    KernelIds.push_back(KernelID);
+    std::vector<kernel_id> KernelIds{KernelID};
     auto DeviceImages =
         ProgramManager::getInstance().getRawDeviceImages(KernelIds);
-    const RTDeviceBinaryImage *DeviceImage = nullptr;
-    for (auto *DI : DeviceImages) {
-      // We are looking for a device image with LLVM IR format and target spec
-      // "llvm_nvptx64", which has been set by the offload-wrapper action.
-      if (DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
-          DI->getRawData().DeviceTargetSpec == std::string("llvm_nvptx64")) {
-        DeviceImage = DI;
-        break;
-      }
-    }
-    if (!DeviceImage) {
+    auto DeviceImage = std::find_if(
+        DeviceImages.begin(), DeviceImages.end(), [](RTDeviceBinaryImage *DI) {
+          return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
+                 DI->getRawData().DeviceTargetSpec ==
+                     std::string("llvm_nvptx64");
+        });
+    if (DeviceImage == DeviceImages.end()) {
       return {nullptr, nullptr};
     }
     auto ContextImpl = Queue->getContextImplPtr();
@@ -84,9 +79,9 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
     auto DeviceImpl = Queue->getDeviceImplPtr();
     auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
     RT::PiProgram Program =
-        detail::ProgramManager::getInstance().createPIProgram(*DeviceImage,
+        detail::ProgramManager::getInstance().createPIProgram(**DeviceImage,
                                                               Context, Device);
-    return {DeviceImage, Program};
+    return {*DeviceImage, Program};
   }
 
   const RTDeviceBinaryImage *DeviceImage = nullptr;
@@ -460,6 +455,11 @@ static ParamIterator preProcessArguments(
     if (Arg->Arg.MPtr) {
       Arg->Arg.MPtr =
           storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+      // Propagate values of scalar parameters as constants to the JIT
+      // compiler.
+      JITConstants.emplace_back(
+          ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
+          Arg->Arg.MPtr, Arg->Arg.MSize);
     }
     // Standard layout arguments do not participate in identical argument
     // detection, but we still add it to the list here. As the SYCL runtime can
@@ -468,16 +468,10 @@ static ParamIterator preProcessArguments(
     // not be materialized by the JIT compiler. Instead of removing some
     // standard layout arguments due to identity and missing some in case the
     // materialization is not possible, we rely on constant propagation to
-    // replace standard layout arguments by constants (see below).
+    // replace standard layout arguments by constants.
     NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
                                     true);
-    // Propagate values of scalar parameters as constants to the JIT
-    // compiler.
-    JITConstants.emplace_back(
-        ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
-        Arg->Arg.MPtr, Arg->Arg.MSize);
     return ++Arg;
-    
   }
   // First check if there's already another parameter with identical
   // value.
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index a019ab9378226..641345aa74353 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -289,7 +289,7 @@ class Command {
   // XPTI instrumentation. Copy code location details to the internal struct.
   // Memory is allocated in this method and released in destructor.
   void copySubmissionCodeLocation();
-  
+
   /// Clear all dependency events for device and host dependencies. This should
   /// only be used if a command is about to be deleted without being executed
   /// before that.

From 4ba8e44e1ec2feaa508c000553af20ece976477e Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 22 Mar 2023 16:40:10 +0000
Subject: [PATCH 16/25] [SYCL][Fusion] Update linkage graph diagram

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 clang/lib/Driver/Driver.cpp | 106 ++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 52 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index a7b15f4d8a9e5..6cb49ba5a0050 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5517,6 +5517,8 @@ class OffloadingActionBuilder final {
         //   s - device code split requested
         //   r - relocatable device code is requested
         //   f - link object output type is TY_Tempfilelist (fat archive)
+        //   e - Embedded IR for fusion (-fsycl-embed-ir) was requested
+        //       and target is NVPTX.
         //   * - "all other cases"
         //     - no condition means output/input is "always" present
         // First symbol indicates output/input type
@@ -5536,58 +5538,58 @@ class OffloadingActionBuilder final {
         //                |             |
         //                |             |
         //         .---------------------------------------.
-        //         |               PostLink                |
-        //         .---------------------------------------.
-        //                           [+*]                [+]
-        //                             |                  |
-        //                             |                  |
-        //                             |---------         |
-        //                             |        |         |
-        //                             |        |         |
-        //                             |      [+!rf]      |
-        //                             |  .-------------. |
-        //                             |  | llvm-foreach| |
-        //                             |  .-------------. |
-        //                             |        |         |
-        //                            [+*]    [+!rf]      |
-        //                      .-----------------.       |
-        //                      | FileTableTform  |       |
-        //                      | (extract "Code")|       |
-        //                      .-----------------.       |
-        //                              [-]               |-----------
-        //           --------------------|                           |
-        //           |                   |                           |
-        //           |                   |-----------------          |
-        //           |                   |                |          |
-        //           |                   |               [-!rf]      |
-        //           |                   |         .--------------.  |
-        //           |                   |         |FileTableTform|  |
-        //           |                   |         |   (merge)    |  |
-        //           |                   |         .--------------.  |
-        //           |                   |               [-]         |-------
-        //           |                   |                |          |      |
-        //           |                   |                |    ------|      |
-        //           |                   |        --------|    |            |
-        //          [.]                 [-*]   [-!rf]        [+!rf]         |
-        //   .---------------.  .-------------------. .--------------.      |
-        //   | finalizeNVPTX  | |  SPIRVTranslator  | |FileTableTform|      |
-        //   | finalizeAMDGCN | |                   | |   (merge)    |      |
-        //   .---------------.  .-------------------. . -------------.      |
-        //          [.]             [-as]      [-!a]         |              |
-        //           |                |          |           |              |
-        //           |              [-s]         |           |              |
-        //           |       .----------------.  |           |              |
-        //           |       | BackendCompile |  |           |              |
-        //           |       .----------------.  |     ------|              |
-        //           |              [-s]         |     |                    |
-        //           |                |          |     |                    |
-        //           |              [-a]      [-!a]  [-!rf]                 |
-        //           |              .--------------------.                  |
-        //           -----------[-n]|   FileTableTform   |[+*]--------------|
-        //                          |  (replace "Code")  |
-        //                          .--------------------.
-        //                                      |
-        //                                    [+*]
+        //         |               PostLink                |[+e]----------------
+        //         .---------------------------------------.                   |
+        //                           [+*]                [+]                   |
+        //                             |                  |                    |
+        //                             |                  |                    |
+        //                             |---------         |                    |
+        //                             |        |         |                    |
+        //                             |        |         |                    |
+        //                             |      [+!rf]      |                    |
+        //                             |  .-------------. |                    |
+        //                             |  | llvm-foreach| |                    |
+        //                             |  .-------------. |                    |
+        //                             |        |         |                    |
+        //                            [+*]    [+!rf]      |                    |
+        //                      .-----------------.       |                    |
+        //                      | FileTableTform  |       |                    |
+        //                      | (extract "Code")|       |                    |
+        //                      .-----------------.       |                    |
+        //                              [-]               |-----------         |
+        //           --------------------|                           |         |
+        //           |                   |                           |         |
+        //           |                   |-----------------          |         |
+        //           |                   |                |          |         |
+        //           |                   |               [-!rf]      |         |
+        //           |                   |         .--------------.  |         |
+        //           |                   |         |FileTableTform|  |         |
+        //           |                   |         |   (merge)    |  |         |
+        //           |                   |         .--------------.  |         |
+        //           |                   |               [-]         |-------  |
+        //           |                   |                |          |      |  |
+        //           |                   |                |    ------|      |  |
+        //           |                   |        --------|    |            |  |
+        //          [.]                 [-*]   [-!rf]        [+!rf]         |  |
+        //   .---------------.  .-------------------. .--------------.      |  |
+        //   | finalizeNVPTX  | |  SPIRVTranslator  | |FileTableTform|      |  |
+        //   | finalizeAMDGCN | |                   | |   (merge)    |      |  |
+        //   .---------------.  .-------------------. . -------------.      |  |
+        //          [.]             [-as]      [-!a]         |              |  |
+        //           |                |          |           |              |  |
+        //           |              [-s]         |           |              |  |
+        //           |       .----------------.  |           |              |  |
+        //           |       | BackendCompile |  |           |              |  |
+        //           |       .----------------.  |     ------|              |  |
+        //           |              [-s]         |     |                    |  |
+        //           |                |          |     |                    |  |
+        //           |              [-a]      [-!a]  [-!rf]                 |  |
+        //           |              .--------------------.                  |  |
+        //           -----------[-n]|   FileTableTform   |[+*]--------------|  |
+        //                          |  (replace "Code")  |                     |
+        //                          .--------------------.                     |
+        //                                      |      -------------------------
+        //                                    [+*]     | [+e]
         //         .--------------------------------------.
         //         |            OffloadWrapper            |
         //         .--------------------------------------.

From 8fcd4c7224f33dde24d9314fa12bf2f5f3126034 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 28 Mar 2023 13:19:51 +0100
Subject: [PATCH 17/25] Don't compile NVPTX-specifics if not supported

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 .../jit-compiler/lib/translation/KernelTranslation.cpp    | 2 +-
 sycl-fusion/passes/CMakeLists.txt                         | 8 ++++++++
 sycl-fusion/passes/target/TargetFusionInfo.cpp            | 4 ++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 8b1f3d913026d..395cd64f0f913 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -227,7 +227,6 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
   LLVMInitializeNVPTXTarget();
   LLVMInitializeNVPTXAsmPrinter();
   LLVMInitializeNVPTXTargetMC();
-#endif // FUSION_JIT_SUPPORT_PTX
 
   static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
   static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
@@ -283,4 +282,5 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
   }
 
   return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX);
+#endif // FUSION_JIT_SUPPORT_PTX
 }
diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index 95f504091dc30..4693083be4faa 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -26,6 +26,10 @@ target_link_libraries(SYCLKernelFusion
   sycl-fusion-common
 )
 
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_PTX)
+endif()
+
 # Static library for linking with the jit_compiler
 add_llvm_library(SYCLKernelFusionPasses
   SYCLFusionPasses.cpp
@@ -60,3 +64,7 @@ target_link_libraries(SYCLKernelFusionPasses
   PRIVATE
   sycl-fusion-common
 )
+
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_PTX)
+endif()
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
index 27514bbb80269..2e534c35037fd 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.cpp
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -209,6 +209,7 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl {
 //
 // NVPTXTargetFusionInfo
 //
+#ifdef FUSION_JIT_SUPPORT_PTX
 class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
 public:
   using TargetFusionInfoImpl::TargetFusionInfoImpl;
@@ -279,6 +280,7 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
   unsigned getPrivateAddressSpace() const override { return 0; }
   unsigned getLocalAddressSpace() const override { return 3; }
 };
+#endif // FUSION_JIT_SUPPORT_PTX
 
 //
 // TargetFusionInfo
@@ -286,10 +288,12 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
 
 TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) {
   llvm::Triple Tri(Mod->getTargetTriple());
+#ifdef FUSION_JIT_SUPPORT_PTX
   if (Tri.isNVPTX()) {
     Impl = std::make_shared<NVPTXTargetFusionInfo>(Mod);
     return;
   }
+#endif // FUSION_JIT_SUPPORT_PTX
   if (Tri.isSPIRV() || Tri.isSPIR()) {
     Impl = std::make_shared<SPIRVTargetFusionInfo>(Mod);
     return;

From 75a77fd90cc1044f70ca3eb0405d99b82dbd56fc Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 28 Mar 2023 15:30:53 +0100
Subject: [PATCH 18/25] Migrate test changes from intel/llvm-test-suite

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/test-e2e/KernelFusion/abort_fusion.cpp   |   4 +-
 .../KernelFusion/abort_internalization.cpp    |  10 +-
 .../abort_internalization_stored_ptr.cpp      |   4 +-
 .../barrier_local_internalization.cpp         |   4 +-
 .../KernelFusion/buffer_internalization.cpp   |   4 +-
 sycl/test-e2e/KernelFusion/cancel_fusion.cpp  |   4 +-
 .../test-e2e/KernelFusion/complete_fusion.cpp |   4 +-
 .../KernelFusion/device_info_descriptor.cpp   |   2 +-
 sycl/test-e2e/KernelFusion/diamond_shape.cpp  |   4 +-
 .../KernelFusion/diamond_shape_local.cpp      | 111 ++++++++++++++++++
 .../KernelFusion/event_wait_cancel.cpp        |   4 +-
 .../KernelFusion/event_wait_complete.cpp      |   4 +-
 .../KernelFusion/existing_local_accessor.cpp  |  78 ++++++++++++
 .../internal_explicit_dependency.cpp          |   4 +-
 .../internalize_array_wrapper.cpp             |   4 +-
 .../KernelFusion/internalize_deep.cpp         |   4 +-
 .../KernelFusion/internalize_multi_ptr.cpp    |   4 +-
 .../test-e2e/KernelFusion/internalize_vec.cpp |   4 +-
 .../KernelFusion/internalize_vfunc.cpp        |   4 +-
 sycl/test-e2e/KernelFusion/jit_caching.cpp    |   4 +-
 .../KernelFusion/local_internalization.cpp    |   4 +-
 .../KernelFusion/non_unit_local_size.cpp      |   4 +-
 .../KernelFusion/pointer_arg_function.cpp     |   4 +-
 .../KernelFusion/private_internalization.cpp  |   4 +-
 .../KernelFusion/ranged_offset_accessor.cpp   |   4 +-
 .../KernelFusion/struct_with_array.cpp        |   4 +-
 .../test-e2e/KernelFusion/sync_acc_mem_op.cpp |   4 +-
 .../KernelFusion/sync_buffer_destruction.cpp  |   4 +-
 .../test-e2e/KernelFusion/sync_event_wait.cpp |   4 +-
 .../KernelFusion/sync_host_accessor.cpp       |   4 +-
 sycl/test-e2e/KernelFusion/sync_host_task.cpp |   4 +-
 .../KernelFusion/sync_queue_destruction.cpp   |   4 +-
 .../test-e2e/KernelFusion/sync_queue_wait.cpp |   4 +-
 .../KernelFusion/sync_second_queue.cpp        |   4 +-
 .../sync_two_queues_event_dep.cpp             |   2 +-
 .../sync_two_queues_requirement.cpp           |   4 +-
 .../test-e2e/KernelFusion/sync_usm_mem_op.cpp |   8 +-
 .../KernelFusion/three_dimensional.cpp        |   4 +-
 .../test-e2e/KernelFusion/two_dimensional.cpp |   4 +-
 .../KernelFusion/usm_no_dependencies.cpp      |   4 +-
 .../KernelFusion/work_group_barrier.cpp       |   4 +-
 sycl/test-e2e/KernelFusion/wrapped_usm.cpp    |   4 +-
 sycl/test-e2e/README.md                       |   1 +
 43 files changed, 276 insertions(+), 80 deletions(-)
 create mode 100644 sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
 create mode 100644 sycl/test-e2e/KernelFusion/existing_local_accessor.cpp

diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp
index 28f5e0d72b5b5..e0a5858b12250 100644
--- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test fusion being aborted: Different scenarios causing the JIT compiler
diff --git a/sycl/test-e2e/KernelFusion/abort_internalization.cpp b/sycl/test-e2e/KernelFusion/abort_internalization.cpp
index b314bf3a0d64d..fdb7b3fa6b193 100644
--- a/sycl/test-e2e/KernelFusion/abort_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_internalization.cpp
@@ -1,9 +1,11 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
-// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -O2 -fsycl-embed-ir %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\
+// RUN: %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
-// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\
+// RUN: %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test incomplete internalization: Different scenarios causing the JIT compiler
diff --git a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
index c3d49cea3c1ab..d05d53e19c68c 100644
--- a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized"
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized"
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test pointers being stored are not internalized.
diff --git a/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp b/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
index 8ff486812b921..3dee82d36006d 100644
--- a/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with local internalization and a combination of kernels
diff --git a/sycl/test-e2e/KernelFusion/buffer_internalization.cpp b/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
index 102441ed1b8c8..4be40a9389a2f 100644
--- a/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/cancel_fusion.cpp b/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
index 6c94f9902579a..9dc5ebe2c007c 100644
--- a/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test cancel fusion
diff --git a/sycl/test-e2e/KernelFusion/complete_fusion.cpp b/sycl/test-e2e/KernelFusion/complete_fusion.cpp
index 0ffeca17a5e78..67c2fb3d05ec9 100644
--- a/sycl/test-e2e/KernelFusion/complete_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/complete_fusion.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion without any internalization
diff --git a/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp b/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp
index 91bd1622a5a5f..c7dc498ce2dd7 100644
--- a/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp
+++ b/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// XFAIL: cuda || hip
+// XFAIL: hip
 // REQUIRES: fusion
 
 // Test correct return from device information descriptor.
diff --git a/sycl/test-e2e/KernelFusion/diamond_shape.cpp b/sycl/test-e2e/KernelFusion/diamond_shape.cpp
index 54a513fbfc71a..73ce385e1c848 100644
--- a/sycl/test-e2e/KernelFusion/diamond_shape.cpp
+++ b/sycl/test-e2e/KernelFusion/diamond_shape.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp b/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
new file mode 100644
index 0000000000000..359c86086e8d7
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
@@ -0,0 +1,111 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// REQUIRES: fusion
+
+// Test complete fusion with local internalization specified on the
+// accessors for a combination of four kernels, forming a diamond-like shape and
+// repeating one of the kernels.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+struct AddKernel {
+  accessor<int, 1> accIn1;
+  accessor<int, 1> accIn2;
+  accessor<int, 1> accOut;
+
+  void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
+};
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp1[dataSize],
+      tmp2[dataSize], tmp3[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp1[i] = -1;
+    tmp2[i] = -1;
+    tmp3[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp1{
+        tmp1,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bTmp2{
+        tmp2,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bTmp3{
+        tmp3,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp1 = bTmp1.get_access(cgh);
+      cgh.parallel_for<AddKernel>(nd_range<1>{{dataSize}, {16}},
+                                  AddKernel{accIn1, accIn2, accTmp1});
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp1 = bTmp1.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accTmp2 = bTmp2.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp1 = bTmp1.get_access(cgh);
+      auto accTmp3 = bTmp3.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp2 = bTmp2.get_access(cgh);
+      auto accTmp3 = bTmp3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<AddKernel>(nd_range<1>{{dataSize}, {16}},
+                                  AddKernel{accTmp2, accTmp3, accOut});
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i + i * 25) && "Computation error");
+    assert(tmp1[i] == -1 && "tmp1 not internalized");
+    assert(tmp2[i] == -1 && "tmp2 not internalized");
+    assert(tmp3[i] == -1 && "tmp3 not internalized");
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp b/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
index 1f4771d7d5b0d..aff44212531d8 100644
--- a/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
+++ b/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test validity of events after cancel_fusion.
diff --git a/sycl/test-e2e/KernelFusion/event_wait_complete.cpp b/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
index cf48a9c1ced43..20e547a09f64e 100644
--- a/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
+++ b/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test validity of events after complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp b/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
new file mode 100644
index 0000000000000..a35473cc1fd51
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
@@ -0,0 +1,78 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// REQUIRES: fusion
+
+// Test complete fusion with local internalization and an local accessor that
+// already exists in one of the input kernels.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_local{});
+      local_accessor<int> accLocal{16, cgh};
+      cgh.parallel_for<class KernelOne>(
+          nd_range<1>{{dataSize}, {16}}, [=](nd_item<1> i) {
+            size_t globalIdx = i.get_global_linear_id();
+            size_t localIdx = i.get_local_linear_id();
+            accLocal[localIdx] = accIn2[globalIdx];
+            accTmp[globalIdx] = accIn1[globalIdx] + accLocal[localIdx];
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_local{});
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(tmp[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp b/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
index 22e3f5ba2b34d..3277ba5ee6f52 100644
--- a/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
+++ b/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test complete fusion where one kernel in the fusion list specifies an
diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
index ba155e0e5ffac..2ebccb626ef23 100644
--- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test internalization of a nested array type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_deep.cpp b/sycl/test-e2e/KernelFusion/internalize_deep.cpp
index 4a0c32e2683b6..2585728259499 100644
--- a/sycl/test-e2e/KernelFusion/internalize_deep.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_deep.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with internalization of a deep struct type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp b/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
index 9455cad86900a..bd521155be6ed 100644
--- a/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/internalize_vec.cpp b/sycl/test-e2e/KernelFusion/internalize_vec.cpp
index 765036ec98d9c..0536a8f15216a 100644
--- a/sycl/test-e2e/KernelFusion/internalize_vec.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_vec.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with internalization of a struct type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
index bf8f177a24904..1a130a404ad71 100644
--- a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/jit_caching.cpp b/sycl/test-e2e/KernelFusion/jit_caching.cpp
index 617b81bb4b48c..7c00a7cc61eee 100644
--- a/sycl/test-e2e/KernelFusion/jit_caching.cpp
+++ b/sycl/test-e2e/KernelFusion/jit_caching.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test caching for JIT fused kernels. Also test for debug messages being
diff --git a/sycl/test-e2e/KernelFusion/local_internalization.cpp b/sycl/test-e2e/KernelFusion/local_internalization.cpp
index b60bd29df394c..508ad08584f4a 100644
--- a/sycl/test-e2e/KernelFusion/local_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/local_internalization.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with local internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp b/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
index 917eda6e090c7..ffd08f918d414 100644
--- a/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
+++ b/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with local internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp b/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
index ffe5178cdaa9c..7c16d212d8485 100644
--- a/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
+++ b/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 // This test currently fails because InferAddressSpace is not able to remove all
 // address-space casts, causing internalization to fail.
diff --git a/sycl/test-e2e/KernelFusion/private_internalization.cpp b/sycl/test-e2e/KernelFusion/private_internalization.cpp
index bf7490e5cabab..ca0e8fdeb2f97 100644
--- a/sycl/test-e2e/KernelFusion/private_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/private_internalization.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp b/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
index 7325b26925349..bd2d2d0ea2a40 100644
--- a/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
+++ b/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization on accessors with different
diff --git a/sycl/test-e2e/KernelFusion/struct_with_array.cpp b/sycl/test-e2e/KernelFusion/struct_with_array.cpp
index 20d25a0d38f50..f79b51908bb49 100644
--- a/sycl/test-e2e/KernelFusion/struct_with_array.cpp
+++ b/sycl/test-e2e/KernelFusion/struct_with_array.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization on a kernel functor with an
diff --git a/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
index 14643a3d81179..440de656d043b 100644
--- a/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on an explicit memory operation on an accessor
 // happening before complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp b/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
index 627a8cdbfe43d..96375f18c41d2 100644
--- a/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on buffer destruction happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_event_wait.cpp b/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
index d34393638e9b4..d077116412683 100644
--- a/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on event::wait() happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp b/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
index 854803f34716d..d6f4cdc77456f 100644
--- a/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on host accessor creation happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_host_task.cpp b/sycl/test-e2e/KernelFusion/sync_host_task.cpp
index fc94fa9b3d9dc..4c2bc870e2919 100644
--- a/sycl/test-e2e/KernelFusion/sync_host_task.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_host_task.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on host task submission happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp b/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
index 145fde97b5012..936b486c7741b 100644
--- a/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on queue destruction happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp b/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
index 5fe768d60c551..71996ffed8cf8 100644
--- a/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on queue::wait() happening before
 // complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_second_queue.cpp b/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
index 057c96935361b..5147a639196c0 100644
--- a/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on submission of kernel with requirements to a
 // different queue happening before complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
index bb33fcdcb8337..a2c9caa88cabc 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
@@ -3,7 +3,7 @@
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // For this test, complete_fusion must be supported.
 // REQUIRES: fusion
 
diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
index 3ca9015c7ee22..d3526f2aba029 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // For this test, complete_fusion must be supported.
 // REQUIRES: fusion
 
diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
index 67af367316fa7..270645af15b40 100644
--- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 // Test fusion cancellation on an explicit memory operation on an USM pointer
 // happening before complete_fusion.
@@ -61,6 +61,10 @@ int main() {
 
   fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
 
+  for (size_t i = 0; i < dataSize; ++i) {
+    std::cout << out[i] << ", ";
+  }
+  std::cout << "\n";
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
     assert(out[i] == (20 * i * i) && "Computation error");
diff --git a/sycl/test-e2e/KernelFusion/three_dimensional.cpp b/sycl/test-e2e/KernelFusion/three_dimensional.cpp
index db0ea25ca3077..805f7f88d782d 100644
--- a/sycl/test-e2e/KernelFusion/three_dimensional.cpp
+++ b/sycl/test-e2e/KernelFusion/three_dimensional.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/two_dimensional.cpp b/sycl/test-e2e/KernelFusion/two_dimensional.cpp
index 2eafb1c1ccc0a..30359b4713dab 100644
--- a/sycl/test-e2e/KernelFusion/two_dimensional.cpp
+++ b/sycl/test-e2e/KernelFusion/two_dimensional.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp b/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
index 13290d06e25f1..85a19c4122202 100644
--- a/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
+++ b/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test complete fusion using USM pointers.
diff --git a/sycl/test-e2e/KernelFusion/work_group_barrier.cpp b/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
index 7141c37be8987..5d7e1a6f16806 100644
--- a/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
+++ b/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with a combination of kernels that require a work-group
diff --git a/sycl/test-e2e/KernelFusion/wrapped_usm.cpp b/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
index aa112ca064ec0..a46c9199b31de 100644
--- a/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
+++ b/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion, aspect-usm_shared_allocations
 
 // Test complete fusion using an wrapped USM pointer as kernel functor argument.
diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md
index b8dfb34406788..1a751973eaea3 100644
--- a/sycl/test-e2e/README.md
+++ b/sycl/test-e2e/README.md
@@ -183,6 +183,7 @@ unavailable.
  * **dump_ir**: - compiler can / cannot dump IR;
  * **llvm-spirv** - llvm-spirv tool availability;
  * **llvm-link** - llvm-link tool availability;
+ * **fusion**: - Runtime supports kernel fusion;
 
 ## llvm-lit parameters
 

From a8afe1dac295c1afa135f513dee08364f85c0075 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 5 Apr 2023 15:34:43 +0100
Subject: [PATCH 19/25] Address more PR feedback

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 .../jit-compiler/lib/translation/KernelTranslation.cpp        | 3 +++
 sycl-fusion/passes/target/TargetFusionInfo.cpp                | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 395cd64f0f913..e1435870eb3b2 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -220,6 +220,9 @@ llvm::Expected<KernelBinary *>
 KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
                                  JITContext &JITCtx) {
 #ifndef FUSION_JIT_SUPPORT_PTX
+  (void)KernelInfo;
+  (void)Mod;
+  (void)JITCtx;
   return createStringError(inconvertibleErrorCode(),
                            "PTX translation not supported in this build");
 #else  // FUSION_JIT_SUPPORT_PTX
diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
index 2e534c35037fd..e6e15a07e28e8 100644
--- a/sycl-fusion/passes/target/TargetFusionInfo.cpp
+++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -49,6 +49,8 @@ class TargetFusionInfoImpl {
   llvm::Module *LLVMMod;
 };
 
+namespace {
+
 //
 // SPIRVTargetFusionInfo
 //
@@ -282,6 +284,8 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl {
 };
 #endif // FUSION_JIT_SUPPORT_PTX
 
+} // anonymous namespace
+
 //
 // TargetFusionInfo
 //

From f7df423d473131d18e38b74a7ab7b71608b492dc Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 5 Apr 2023 15:34:59 +0100
Subject: [PATCH 20/25] Add test for kernel fusion with math function

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/test-e2e/KernelFusion/math_function.cpp | 64 ++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 sycl/test-e2e/KernelFusion/math_function.cpp

diff --git a/sycl/test-e2e/KernelFusion/math_function.cpp b/sycl/test-e2e/KernelFusion/math_function.cpp
new file mode 100644
index 0000000000000..f2dc1541f69cb
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/math_function.cpp
@@ -0,0 +1,64 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// REQUIRES: fusion
+
+// Test fusion of a kernel using a math function. 
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  float in1[dataSize], in2[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = 1;
+    in2[i] = i * 3;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<float> bIn1{in1, range{dataSize}};
+    buffer<float> bIn2{in2, range{dataSize}};
+    buffer<float> bTmp{tmp, range{dataSize}};
+    buffer<float> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = sycl::cospi(accIn1[i]); });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn2[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (-1.0 * static_cast<float>(i*3)) && "Computation error");
+  }
+
+  return 0;
+}

From bc32fad74fd81d753be7c29f55a303e5ec080fbe Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 5 Apr 2023 15:35:26 +0100
Subject: [PATCH 21/25] Document CUDA kernel fusion in design documentation

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/doc/design/CompilerAndRuntimeDesign.md   |  23 ++
 .../doc/design/images/DevicePTXProcessing.svg | 228 ++++++++----------
 2 files changed, 124 insertions(+), 127 deletions(-)

diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md
index 3db49acbd36b1..f06b0d52c257d 100644
--- a/sycl/doc/design/CompilerAndRuntimeDesign.md
+++ b/sycl/doc/design/CompilerAndRuntimeDesign.md
@@ -758,6 +758,29 @@ entry:
 
 Note: Kernel naming is not fully stable for now.
 
+##### Kernel Fusion Support
+
+The [experimental kernel fusion
+extension](../extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc)
+also supports the CUDA backend. However, as neither CUBIN nor PTX are a suitable
+input format for the [kernel fusion JIT compiler](KernelFusionJIT.md), a
+suitable IR has to be added as an additional device binary.
+
+Therefore, in case kernel fusion should be performed for the CUDA backend, the
+user needs to specify the additional flag `-fsycl-embed-ir` during compilation,
+to add LLVM IR as an additional device binary. When the flag `-fsycl-embed-ir`
+is specified, the LLVM IR produced by Clang for the CUDA backend device
+compilation is added to the fat binary file. To this end, the resulting
+file-table from `sycl-post-link` is additionally passed to the
+`clang-offload-wrapper`, creating a wrapper object with target `llvm_nvptx64`.
+
+This device binary in LLVM IR format can be retrieved by the SYCL runtime and
+used by the kernel fusion JIT compiler. The resulting fused kernel is compiled
+to PTX assembly by the kernel fusion JIT compiler at runtime.
+
+Note that the device binary in LLVM IR does not replace the device binary in
+CUBIN/PTX format, but is embed in addition to it.
+
 ### Integration with SPIR-V format
 
 This section explains how to generate SPIR-V specific types and operations from
diff --git a/sycl/doc/design/images/DevicePTXProcessing.svg b/sycl/doc/design/images/DevicePTXProcessing.svg
index df690ec5fb08b..79a9c5e5c4fc9 100644
--- a/sycl/doc/design/images/DevicePTXProcessing.svg
+++ b/sycl/doc/design/images/DevicePTXProcessing.svg
@@ -1,20 +1,20 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:xlink="http://www.w3.org/1999/xlink"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
    sodipodi:docname="DevicePTXProcessing.svg"
-   inkscape:version="0.92.3 (2405546, 2018-03-11)"
+   inkscape:version="1.2.2 (732a01da63, 2022-12-09)"
    id="svg8"
    version="1.1"
    viewBox="0 0 205.79753 221.03191"
    height="221.03191mm"
-   width="205.79753mm">
+   width="205.79753mm"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:xlink="http://www.w3.org/1999/xlink"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:dc="http://purl.org/dc/elements/1.1/">
   <defs
      id="defs2">
     <marker
@@ -2197,6 +2197,20 @@
          d="M 5.77,0 -2.88,5 V -5 Z"
          id="path4942-3" />
     </marker>
+    <marker
+       inkscape:isstock="true"
+       style="overflow:visible"
+       id="marker4944-3-5"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="TriangleOutM">
+      <path
+         transform="scale(0.4)"
+         style="fill:#4472c4;fill-opacity:1;fill-rule:evenodd;stroke:#4472c4;stroke-width:1pt;stroke-opacity:1"
+         d="M 5.77,0 -2.88,5 V -5 Z"
+         id="path4942-3-2" />
+    </marker>
   </defs>
   <sodipodi:namedview
      fit-margin-bottom="0"
@@ -2204,16 +2218,16 @@
      fit-margin-left="0"
      fit-margin-top="0"
      inkscape:window-maximized="1"
-     inkscape:window-y="0"
-     inkscape:window-x="1920"
-     inkscape:window-height="1163"
-     inkscape:window-width="1920"
+     inkscape:window-y="-8"
+     inkscape:window-x="-8"
+     inkscape:window-height="1377"
+     inkscape:window-width="2560"
      showgrid="false"
      inkscape:document-rotation="0"
      inkscape:current-layer="layer1"
      inkscape:document-units="mm"
-     inkscape:cy="330.42371"
-     inkscape:cx="228.34098"
+     inkscape:cy="330.71429"
+     inkscape:cx="228.57143"
      inkscape:zoom="1.4"
      inkscape:pageshadow="2"
      inkscape:pageopacity="0.0"
@@ -2222,7 +2236,10 @@
      pagecolor="#ffffff"
      id="base"
      inkscape:snap-bbox="false"
-     inkscape:snap-global="false" />
+     inkscape:snap-global="false"
+     inkscape:showpageshadow="2"
+     inkscape:pagecheckerboard="0"
+     inkscape:deskcolor="#d1d1d1" />
   <metadata
      id="metadata5">
     <rdf:RDF>
@@ -2318,16 +2335,14 @@
        d="m 125.31238,20.80355 0.32455,-1.291166 1.29117,-0.324556 z" />
     <text
        id="text899"
-       style="font-variant:normal;font-weight:normal;font-size:4.95300007px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.35277799"
+       style="font-variant:normal;font-weight:normal;font-size:4.953px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="99.965302"
-       y="16.988504">
-      <tspan
+       y="16.988504"><tspan
          id="tspan897"
          sodipodi:role="line"
-         style="stroke-width:0.35277799"
+         style="stroke-width:0.352778"
          x="99.965302"
-         y="16.988504">File table</tspan>
-    </text>
+         y="16.988504">File table</tspan></text>
     <path
        inkscape:connector-curvature="0"
        id="path901"
@@ -2428,13 +2443,11 @@
        id="text1309"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="101.31163"
-       y="84.249756">
-      <tspan
+       y="84.249756"><tspan
          sodipodi:role="line"
          id="tspan1686"
          x="101.31163"
-         y="84.249756">Clang</tspan>
-    </text>
+         y="84.249756">Clang</tspan></text>
     <g
        id="g1323"
        transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)">
@@ -2495,13 +2508,12 @@
         <text
            id="text1385"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,360.07,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,360.07,89.904)"
+           y="0"><tspan
              id="tspan1383"
              sodipodi:role="line"
              y="0"
-             x="0 7.7220001 15.444 23.166 30.94416">clang</tspan>
-        </text>
+             x="0 7.7220001 15.444 23.166 30.94416">clang</tspan></text>
       </g>
     </g>
     <g
@@ -2513,12 +2525,10 @@
         <text
            id="text1397"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,398.71,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,398.71,89.904)"><tspan
              id="tspan1395"
              y="0"
-             x="0">-</tspan>
-        </text>
+             x="0">-</tspan></text>
       </g>
     </g>
     <g
@@ -2530,13 +2540,12 @@
         <text
            id="text1409"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,406.51,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,406.51,89.904)"
+           y="0"><tspan
              id="tspan1407"
              sodipodi:role="line"
              y="0"
-             x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.528561">offload</tspan>
-        </text>
+             x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.528561">offload</tspan></text>
       </g>
     </g>
     <g
@@ -2548,12 +2557,10 @@
         <text
            id="text1421"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,460.78,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,460.78,89.904)"><tspan
              id="tspan1419"
              y="0"
-             x="0">-</tspan>
-        </text>
+             x="0">-</tspan></text>
       </g>
     </g>
     <g
@@ -2565,13 +2572,12 @@
         <text
            id="text1433"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Consolas;-inkscape-font-specification:Consolas;writing-mode:lr-tb;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,468.58,89.904)">
-          <tspan
+           transform="matrix(1,0,0,-1,468.58,89.904)"
+           y="0"><tspan
              id="tspan1431"
              sodipodi:role="line"
              y="0"
-             x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.458359">wrapper</tspan>
-        </text>
+             x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.458359">wrapper</tspan></text>
       </g>
     </g>
     <g
@@ -2629,13 +2635,11 @@
        id="text1509"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="81.597427"
-       y="60.97049">
-      <tspan
+       y="60.97049"><tspan
          sodipodi:role="line"
          id="tspan2072"
          x="81.597427"
-         y="60.97049">PTX target processing</tspan>
-    </text>
+         y="60.97049">PTX target processing</tspan></text>
     <g
        id="g2083"
        transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)">
@@ -2697,13 +2701,12 @@
         <text
            id="text2129"
            style="font-variant:normal;font-weight:normal;font-size:14.04px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,397.06,43.272)">
-          <tspan
+           transform="matrix(1,0,0,-1,397.06,43.272)"
+           y="0"><tspan
              id="tspan2127"
              sodipodi:role="line"
              y="0"
-             x="0 12.11652 16.79184 23.517 30.831841 38.146679 45.138599 50.038559 53.141399 60.540482 67.911484 71.267036 78.258957 84.099602">Wrapper object</tspan>
-        </text>
+             x="0 12.11652 16.79184 23.517 30.831841 38.146679 45.138599 50.038559 53.141399 60.540482 67.911484 71.267036 78.258957 84.099602">Wrapper object</tspan></text>
       </g>
     </g>
     <g
@@ -2767,13 +2770,12 @@
         <text
            id="text2177"
            style="font-variant:normal;font-weight:normal;font-size:14.064px;font-family:Calibri;-inkscape-font-specification:Calibri;writing-mode:lr-tb;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none"
-           transform="matrix(1,0,0,-1,407.18,26.328)">
-          <tspan
+           transform="matrix(1,0,0,-1,407.18,26.328)"
+           y="0"><tspan
              id="tspan2175"
              sodipodi:role="line"
              y="0"
-             x="0 8.6493597 15.484464 21.841393 25.076113 30.968927 37.930607 41.151264 46.917503 54.329231 61.712833">Device code</tspan>
-        </text>
+             x="0 8.6493597 15.484464 21.841393 25.076113 30.968927 37.930607 41.151264 46.917503 54.329231 61.712833">Device code</tspan></text>
       </g>
     </g>
     <path
@@ -2790,13 +2792,11 @@
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193"
        x="28.692085"
-       y="17.476978">
-      <tspan
+       y="17.476978"><tspan
          sodipodi:role="line"
          id="tspan1595"
          x="28.692085"
-         y="17.476978">(from sycl-post-link)</tspan>
-    </text>
+         y="17.476978">(from sycl-post-link)</tspan></text>
     <g
        id="g1668"
        transform="translate(-29.00824,13.042864)">
@@ -2814,13 +2814,11 @@
          style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
          id="text899-9"
          x="73.031509"
-         y="68.195061">
-        <tspan
+         y="68.195061"><tspan
            y="68.195061"
            x="73.031509"
            id="tspan1662"
-           sodipodi:role="line">libspirv.bc</tspan>
-      </text>
+           sodipodi:role="line">libspirv.bc</tspan></text>
     </g>
     <g
        id="g2345"
@@ -2839,13 +2837,11 @@
          y="81.860855"
          x="71.144508"
          style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
-         id="text899-0">
-        <tspan
+         id="text899-0"><tspan
            y="81.860855"
            x="71.144508"
            id="tspan2332"
-           sodipodi:role="line">libdevice.bc</tspan>
-      </text>
+           sodipodi:role="line">libdevice.bc</tspan></text>
     </g>
     <g
        id="g1263-0"
@@ -2902,13 +2898,11 @@
        id="text1309-1"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="102.06758"
-       y="113.47669">
-      <tspan
+       y="113.47669"><tspan
          sodipodi:role="line"
          id="tspan2597"
          x="102.06758"
-         y="113.47669">ptxas</tspan>
-    </text>
+         y="113.47669">ptxas</tspan></text>
     <g
        id="g1263-02"
        transform="matrix(0.35277777,0,0,-0.35277777,-7.4064096,246.33646)">
@@ -2964,13 +2958,11 @@
        id="text1309-7"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="102.06758"
-       y="137.37424">
-      <tspan
+       y="137.37424"><tspan
          sodipodi:role="line"
          id="tspan2611"
          x="102.06758"
-         y="137.37424">fatbin</tspan>
-    </text>
+         y="137.37424">fatbin</tspan></text>
     <path
        style="fill:#538f7b;fill-opacity:1;fill-rule:evenodd;stroke:#538f7b;stroke-width:0.60211;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker3606)"
        d="m 109.38615,118.87647 v 9.24756"
@@ -2980,24 +2972,20 @@
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1"
        x="110.58295"
-       y="97.450661">
-      <tspan
+       y="97.450661"><tspan
          y="97.450661"
          x="110.58295"
          sodipodi:role="line"
-         id="tspan3794">ptx</tspan>
-    </text>
+         id="tspan3794">ptx</tspan></text>
     <text
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-3"
        x="111.25836"
-       y="125.55264">
-      <tspan
+       y="125.55264"><tspan
          y="125.55264"
          x="111.25836"
          sodipodi:role="line"
-         id="tspan3916">cubin</tspan>
-    </text>
+         id="tspan3916">cubin</tspan></text>
     <path
        style="fill:none;fill-rule:evenodd;stroke:#538f7b;stroke-width:0.616;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker4036)"
        d="M 108.63556,91.358218 V 104.16258"
@@ -3014,13 +3002,11 @@
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-4"
        x="110.90725"
-       y="73.327454">
-      <tspan
+       y="73.327454"><tspan
          sodipodi:role="line"
          id="tspan4924"
          x="110.90725"
-         y="73.327454">LLVM IR</tspan>
-    </text>
+         y="73.327454">LLVM IR</tspan></text>
     <path
        style="fill:none;fill-rule:evenodd;stroke:#4472c4;stroke-width:0.616;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker4944)"
        d="m 72.28691,79.325713 h 9.08716 v 5.34539 h 6.94901"
@@ -3036,35 +3022,29 @@
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-3-9"
        x="111.2326"
-       y="150.29218">
-      <tspan
+       y="150.29218"><tspan
          y="150.29218"
          x="111.2326"
          sodipodi:role="line"
-         id="tspan2303">CUDA fatbin</tspan>
-    </text>
+         id="tspan2303">CUDA fatbin</tspan></text>
     <text
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-5"
        x="31.587437"
-       y="216.68318">
-      <tspan
+       y="216.68318"><tspan
          sodipodi:role="line"
          id="tspan1653"
          x="31.587437"
-         y="216.68318">(to host linker)</tspan>
-    </text>
+         y="216.68318">(to host linker)</tspan></text>
     <text
        id="text1309-0"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.05694px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="91.899139"
-       y="88.973877">
-      <tspan
+       y="88.973877"><tspan
          sodipodi:role="line"
          id="tspan1706"
          x="91.899139"
-         y="88.973877">(nvptx backend)</tspan>
-    </text>
+         y="88.973877">(nvptx backend)</tspan></text>
     <path
        inkscape:connector-curvature="0"
        id="path995-7"
@@ -3140,35 +3120,29 @@
        id="text1509-0"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="86.807594"
-       y="33.349266">
-      <tspan
+       y="33.349266"><tspan
          sodipodi:role="line"
          id="tspan2072-3"
          x="86.807594"
-         y="33.349266">file-table-tform</tspan>
-    </text>
+         y="33.349266">file-table-tform</tspan></text>
     <text
        id="text1509-0-9"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="90.741898"
-       y="37.806171">
-      <tspan
+       y="37.806171"><tspan
          sodipodi:role="line"
          id="tspan2072-3-5"
          x="90.741898"
-         y="37.806171">(Copy &quot;Code&quot;)</tspan>
-    </text>
+         y="37.806171">(Copy &quot;Code&quot;)</tspan></text>
     <text
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-4-1"
        x="110.37068"
-       y="47.484673">
-      <tspan
+       y="47.484673"><tspan
          sodipodi:role="line"
          id="tspan4924-3"
          x="110.37068"
-         y="47.484673">LLVM IR</tspan>
-    </text>
+         y="47.484673">LLVM IR</tspan></text>
     <path
        inkscape:connector-curvature="0"
        id="path995-7-4"
@@ -3244,40 +3218,40 @@
        id="text1509-0-5"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="87.676941"
-       y="161.4454">
-      <tspan
+       y="161.4454"><tspan
          sodipodi:role="line"
          id="tspan2072-3-0"
          x="87.676941"
-         y="161.4454">file-table-tform</tspan>
-    </text>
+         y="161.4454">file-table-tform</tspan></text>
     <text
        id="text1509-0-9-5"
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        x="87.907066"
-       y="165.9023">
-      <tspan
+       y="165.9023"><tspan
          sodipodi:role="line"
          id="tspan2072-3-5-4"
          x="87.907066"
-         y="165.9023">(Replace &quot;Code&quot;)</tspan>
-    </text>
+         y="165.9023">(Replace &quot;Code&quot;)</tspan></text>
     <text
        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778"
        id="text2193-1-3-9-2"
        x="110.86103"
-       y="175.67789">
-      <tspan
+       y="175.67789"><tspan
          y="175.67789"
          x="110.86103"
          sodipodi:role="line"
-         id="tspan2303-1">File table</tspan>
-    </text>
+         id="tspan2303-1">File table</tspan></text>
     <path
        style="fill:none;fill-rule:evenodd;stroke:#4472c4;stroke-width:0.616;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker4944-3)"
        d="m 108.65966,22.915376 h 69.56335 l 0,138.959974 h -9.30396"
        id="path4926-6"
        inkscape:connector-curvature="0"
        sodipodi:nodetypes="cccc" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#4472c4;stroke-width:0.616;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#marker4944-3-5)"
+       d="m 177.4031,22.943668 h 14.37883 L 191.97091,188.17299 169.2488,187.984"
+       id="path4926-6-8"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cccc" />
   </g>
 </svg>

From b4d39684972607d89b25549af6396ac5278d51b9 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Tue, 18 Apr 2023 13:02:35 +0100
Subject: [PATCH 22/25] Update kernel fusion design document

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/doc/design/KernelFusionJIT.md | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
index 6678b9359f832..dc458c4f2b71a 100644
--- a/sycl/doc/design/KernelFusionJIT.md
+++ b/sycl/doc/design/KernelFusionJIT.md
@@ -162,11 +162,20 @@ The metadata is attached to a function that will become the fused kernel:
 
 ### Support for non SPIR-V targets
 
-Non SPIR-V targets (NVPTX / AMDGCN) are not supported at the moment as they cannot ingest a SPIR-V module. However, we are looking into adding support for these targets once the initial SPIR-V based path is operational.
+Fusion is currently supported for the NVPTX/CUDA backend. 
 
-In this scenario, two options are possible to add JIT support:
+As this backend cannot ingest a SPIR-V module, additional changes to the
+compilation flow are necessary. During static compilation the LLVM module for
+this backend is stored in addition to the finalized binary. 
 
- - During static compilation we store the LLVM module on top of the finalized binary. This behavior could be controlled by a flag to avoid a too important binary inflation. Then, during the fusion process, the JIT will load that LLVM IR and finalize the fused kernel to the final target as driven by the PI plugin.
- - SPIR-V ingestion support is added for these targets. The module to be loaded could then be the generic SPIR-V module. This path would however exclude target specific optimizations written in user's code. The current state of the SPIR-V translator does not allow this at the moment and significant work is needed to add this support.
+This behavior is controlled by the `-fsycl-embed-ir` flag to avoid binary
+inflation in case kernel fusion is not used. If users want to use kernel fusion
+at runtime on the NVPTX/CUDA backend, they need to pass the `-fsycl-embed-ir`
+flag during static compilation. 
 
-In these cases, PI will need to be extended to allow to somehow drive the JIT process, so it is tailored to the plugin target needs.
+During the fusion process at runtime , the JIT will load the LLVM IR and
+finalize the fused kernel to the final target. More information is available
+[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). 
+
+Support for the AMD GPU/HIP/AMDGCN backend is not yet implemented, but could
+follow an approach similar to the NVPTX/CUDA backend.

From a7e13694e3b2418274c3f73f1ef51a50a905e00d Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Thu, 20 Apr 2023 16:56:57 +0100
Subject: [PATCH 23/25] Fix formatting for test

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/test-e2e/KernelFusion/math_function.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/KernelFusion/math_function.cpp b/sycl/test-e2e/KernelFusion/math_function.cpp
index f2dc1541f69cb..ef9357672977e 100644
--- a/sycl/test-e2e/KernelFusion/math_function.cpp
+++ b/sycl/test-e2e/KernelFusion/math_function.cpp
@@ -4,7 +4,7 @@
 // UNSUPPORTED: hip
 // REQUIRES: fusion
 
-// Test fusion of a kernel using a math function. 
+// Test fusion of a kernel using a math function.
 
 #include <sycl/sycl.hpp>
 
@@ -57,7 +57,7 @@ int main() {
 
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
-    assert(out[i] == (-1.0 * static_cast<float>(i*3)) && "Computation error");
+    assert(out[i] == (-1.0 * static_cast<float>(i * 3)) && "Computation error");
   }
 
   return 0;

From 88b4ada8df28d99de8daba1874f5c9e23d66f596 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Thu, 27 Apr 2023 13:35:59 +0100
Subject: [PATCH 24/25] Rebase on branch 'sycl'

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index ae20134d46b8c..ed64e35c06509 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -40,7 +40,7 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
 }
 
 ::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) {
-  auto Backend = Queue->getDeviceImplPtr()->getPlugin().getBackend();
+  auto Backend = Queue->getDeviceImplPtr()->getBackend();
   switch (Backend) {
   case backend::ext_oneapi_level_zero:
   case backend::opencl:
@@ -58,8 +58,8 @@ std::pair<const RTDeviceBinaryImage *, RT::PiProgram>
 retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
   auto KernelName = KernelCG->getKernelName();
 
-  bool isNvidia = Queue->getDeviceImplPtr()->getPlugin().getBackend() ==
-                  backend::ext_oneapi_cuda;
+  bool isNvidia =
+      Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda;
   if (isNvidia) {
     auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
     std::vector<kernel_id> KernelIds{KernelID};

From 4877a40089e5f7825297e0f8c40cbc4e35d0316c Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Thu, 4 May 2023 13:28:42 +0100
Subject: [PATCH 25/25] Address PR feedback and formatting

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 sycl/doc/design/KernelFusionJIT.md             |  2 +-
 sycl/source/detail/device_info.hpp             | 16 ++++++++++++----
 sycl/source/detail/scheduler/commands.hpp      |  9 ++++++---
 sycl/source/detail/scheduler/graph_builder.cpp |  1 -
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
index dc458c4f2b71a..53367b5a930b3 100644
--- a/sycl/doc/design/KernelFusionJIT.md
+++ b/sycl/doc/design/KernelFusionJIT.md
@@ -173,7 +173,7 @@ inflation in case kernel fusion is not used. If users want to use kernel fusion
 at runtime on the NVPTX/CUDA backend, they need to pass the `-fsycl-embed-ir`
 flag during static compilation. 
 
-During the fusion process at runtime , the JIT will load the LLVM IR and
+During the fusion process at runtime, the JIT will load the LLVM IR and
 finalize the fused kernel to the final target. More information is available
 [here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). 
 
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 1a34178a4a3c2..85cedb47e6e92 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -105,10 +105,18 @@ affinityDomainToString(info::partition_affinity_domain AffinityDomain) {
 }
 
 // Mapping expected SYCL return types to those returned by PI calls
-template <typename T> struct sycl_to_pi { using type = T; };
-template <> struct sycl_to_pi<bool> { using type = pi_bool; };
-template <> struct sycl_to_pi<device> { using type = RT::PiDevice; };
-template <> struct sycl_to_pi<platform> { using type = RT::PiPlatform; };
+template <typename T> struct sycl_to_pi {
+  using type = T;
+};
+template <> struct sycl_to_pi<bool> {
+  using type = pi_bool;
+};
+template <> struct sycl_to_pi<device> {
+  using type = RT::PiDevice;
+};
+template <> struct sycl_to_pi<platform> {
+  using type = RT::PiPlatform;
+};
 
 // Mapping fp_config device info types to the values used to check fp support
 template <typename Param> struct check_fp_support {};
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 641345aa74353..5f5f68ed1bf56 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -290,12 +290,15 @@ class Command {
   // Memory is allocated in this method and released in destructor.
   void copySubmissionCodeLocation();
 
-  /// Clear all dependency events for device and host dependencies. This should
-  /// only be used if a command is about to be deleted without being executed
-  /// before that.
+  /// Clear all dependency events This should only be used if a command is about
+  /// to be deleted without being executed before that. As of now, the only
+  /// valid use case for this function is in kernel fusion, where the fused
+  /// kernel commands are replaced by the fused command without ever being
+  /// executed.
   void clearAllDependencies() {
     MPreparedDepsEvents.clear();
     MPreparedHostDepsEvents.clear();
+    MDeps.clear();
   }
 
   /// Contains list of dependencies(edges)
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index a404fe707698b..c40b5c0f69fdc 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1395,7 +1395,6 @@ void Scheduler::GraphBuilder::removeNodeFromGraph(
     Dep.MDepCommand->MUsers.erase(Node);
   }
 
-  Node->MDeps.clear();
   // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called
   // from the destructor of the command to delete the dependencies of the
   // command this command depends on.