From 5b2d0a01e76dfed0e6e7aac794f78dd91817cd74 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Mon, 9 Jan 2023 13:00:37 +0000 Subject: [PATCH 01/25] [SYCL][Fusion] Embed LLVM IR for SYCL for Nvidia Signed-off-by: Lukas Sommer --- clang/include/clang/Driver/Action.h | 7 ++++++- clang/include/clang/Driver/Options.td | 2 ++ clang/lib/Driver/Action.cpp | 6 +++--- clang/lib/Driver/Driver.cpp | 10 ++++++++++ clang/lib/Driver/ToolChains/Clang.cpp | 10 +++++++++- 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index 2c9f59cdde9b0..9dab2e32b2ffc 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -660,9 +660,14 @@ class OffloadUnbundlingJobAction final : public JobAction { class OffloadWrapperJobAction : public JobAction { void anchor() override; + bool EmbedIR; + public: OffloadWrapperJobAction(ActionList &Inputs, types::ID Type); - OffloadWrapperJobAction(Action *Input, types::ID OutputType); + OffloadWrapperJobAction(Action *Input, types::ID OutputType, + bool IsEmbeddedIR = false); + + bool isEmbeddedIR() const { return EmbedIR; } static bool classof(const Action *A) { return A->getKind() == OffloadWrapperJobClass; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index aa56bc605a7a8..6afef8e821e08 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2976,6 +2976,8 @@ def fintelfpga : Flag<["-"], "fintelfpga">, Group, HelpText<"Perform ahead-of-time compilation for FPGA">; def fsycl_device_only : Flag<["-"], "fsycl-device-only">, Flags<[CoreOption]>, HelpText<"Compile SYCL kernels for device">; +def fsycl_embed_ir : Flag<["-"], "fsycl-embed-ir">, Flags<[CoreOption]>, + HelpText<"Embed LLVM IR for runtime kernel fusion">; defm sycl_esimd_force_stateless_mem : BoolFOption<"sycl-esimd-force-stateless-mem", LangOpts<"SYCLESIMDForceStatelessMem">, DefaultFalse, PosFlag( + PostLinkAction, types::TY_Object, true); + DA.add(*WrapBitcodeAction, *TC, BoundArch, Action::OFK_SYCL); + } bool NoRDCFatStaticArchive = !IsRDC && FullDeviceLinkAction->getType() == types::TY_Tempfilelist; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 42e5a64dc2993..b2883f49a5a31 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9253,6 +9253,14 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, createArgString("-link-opts="); } + bool IsEmbeddedIR = cast(JA).isEmbeddedIR(); + if (IsEmbeddedIR) { + // When the offload-wrapper is called to embed LLVM IR, add a prefix to + // the target triple to distinguish the LLVM IR from the actual device + // binary for that target. + TargetTripleOpt = ("llvm_" + TargetTripleOpt).str(); + } + WrapperArgs.push_back( C.getArgs().MakeArgString(Twine("-target=") + TargetTripleOpt)); @@ -9274,7 +9282,7 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, assert(I.isFilename() && "Invalid input."); if (I.getType() == types::TY_Tempfiletable || - I.getType() == types::TY_Tempfilelist) + I.getType() == types::TY_Tempfilelist || IsEmbeddedIR) // wrapper actual input files are passed via the batch job file table: WrapperArgs.push_back(C.getArgs().MakeArgString("-batch")); WrapperArgs.push_back(C.getArgs().MakeArgString(I.getFilename())); From ee8c29b0f6d2a5e8195238f9c49dd4688004dd89 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 10 Jan 2023 13:29:22 +0000 Subject: [PATCH 02/25] Enable LLVM IR as alternative fusion input format; Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/CMakeLists.txt | 4 +- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 5 +- .../lib/translation/LoadKernels.cpp | 116 ++++++++++++++++++ .../lib/translation/LoadKernels.h | 36 ++++++ .../lib/translation/SPIRVLLVMTranslation.cpp | 94 +++----------- .../lib/translation/SPIRVLLVMTranslation.h | 13 +- sycl/source/detail/jit_compiler.cpp | 108 +++++++++++----- 7 files changed, 251 insertions(+), 125 deletions(-) create mode 100644 sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp create mode 100644 sycl-fusion/jit-compiler/lib/translation/LoadKernels.h diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index de6a73b1eab3d..92f0cefd68634 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -2,13 +2,15 @@ add_llvm_library(sycl-fusion lib/KernelFusion.cpp lib/JITContext.cpp + lib/translation/LoadKernels.cpp lib/translation/SPIRVLLVMTranslation.cpp lib/fusion/FusionPipeline.cpp lib/fusion/FusionHelper.cpp lib/fusion/ModuleHelper.cpp lib/helper/ConfigHelper.cpp - LINK_COMPONENTS + LINK_COMPONENTS + BitReader Core Support Analysis diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 56ea9401c465b..fc168587738b5 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -15,6 +15,7 @@ #include "fusion/FusionPipeline.h" #include "helper/ConfigHelper.h" #include "helper/ErrorHandling.h" +#include "translation/LoadKernels.h" #include "translation/SPIRVLLVMTranslation.h" #include #include @@ -97,8 +98,8 @@ FusionResult KernelFusion::fuseKernels( // Load all input kernels from their respective SPIR-V modules into a single // LLVM IR module. llvm::Expected> ModOrError = - translation::SPIRVLLVMTranslator::loadSPIRVKernels( - *JITCtx.getLLVMContext(), ModuleInfo.kernels()); + translation::KernelLoader::loadKernels(*JITCtx.getLLVMContext(), + ModuleInfo.kernels()); if (auto Error = ModOrError.takeError()) { return errorToFusionResult(std::move(Error), "SPIR-V translation failed"); } diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp new file mode 100644 index 0000000000000..cabf1f6c44e9f --- /dev/null +++ b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp @@ -0,0 +1,116 @@ +//==-------------------------- LoadKernels.cpp ----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "LoadKernels.h" +#include "SPIRVLLVMTranslation.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace jit_compiler; +using namespace jit_compiler::translation; +using namespace llvm; + +llvm::Expected> +KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx, + std::vector &Kernels) { + std::unique_ptr Result{nullptr}; + bool First = true; + DenseSet ParsedBinaries; + size_t AddressBits = 0; + for (auto &Kernel : Kernels) { + // FIXME: Currently, we use the front of the list. + // Do we need to iterate to find the most suitable + // SPIR-V module? + SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo; + + const unsigned char *ModulePtr = BinInfo.BinaryStart; + size_t ModuleSize = BinInfo.BinarySize; + BinaryBlob BinBlob{ModulePtr, ModuleSize}; + if (ParsedBinaries.contains(BinBlob)) { + // Multiple kernels can be stored in the same SPIR-V or LLVM IR module. + // If we encountered the same binary module before, skip. + // NOTE: We compare the pointer as well as the size, in case + // a previous kernel only referenced part of the SPIR-V/LLVM IR module. + // Not sure this can actually happen, but better safe than sorry. + continue; + } + // Simply load and translate the SPIR-V into the currently still empty + // module. + std::unique_ptr NewMod; + + switch (BinInfo.Format) { + case BinaryFormat::LLVM: { + auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel); + if (auto Err = ModOrError.takeError()) { + return std::move(Err); + } + NewMod = std::move(*ModOrError); + break; + } + case BinaryFormat::SPIRV: { + auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel); + if (auto Err = ModOrError.takeError()) { + return std::move(Err); + } + NewMod = std::move(*ModOrError); + break; + } + default: { + return createStringError( + inconvertibleErrorCode(), + "Failed to load kernel from unsupported input format"); + } + } + + // We do not assume that the input binary information has the address bits + // set, but rather retrieve this information from the SPIR-V/LLVM module's + // data-layout. + BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits(); + + if (First) { + // We can simply assign the module we just loaded from SPIR-V to the + // empty pointer on the first iteration. + Result = std::move(NewMod); + // The first module will dictate the address bits for the remaining. + AddressBits = BinInfo.AddressBits; + First = false; + } else { + // We have already loaded some module, so now we need to + // link the module we just loaded with the result so far. + // FIXME: We allow duplicates to be overridden by the module + // read last. This could cause problems if different modules contain + // definitions with the same name, but different body/content. + // Check that this is not problematic. + Linker::linkModules(*Result, std::move(NewMod), + Linker::Flags::OverrideFromSrc); + if (AddressBits != BinInfo.AddressBits) { + return createStringError( + inconvertibleErrorCode(), + "Number of address bits between SPIR-V modules does not match"); + } + } + } + return std::move(Result); +} + +llvm::Expected> +KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { + auto &BinInfo = Kernel.BinaryInfo; + llvm::StringRef RawData(reinterpret_cast(BinInfo.BinaryStart), + BinInfo.BinarySize); + return llvm::parseBitcodeFile( + MemoryBuffer::getMemBuffer(RawData)->getMemBufferRef(), LLVMCtx); +} + +llvm::Expected> +KernelLoader::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { + return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel); +} diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h new file mode 100644 index 0000000000000..5720abf09bfd7 --- /dev/null +++ b/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h @@ -0,0 +1,36 @@ +//==-- LoadKernels.h - Load LLVM IR for SYCL kernels in different formats -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Kernel.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include +#include + +namespace jit_compiler { +namespace translation { + +class KernelLoader { + +public: + static llvm::Expected> + loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels); + +private: + /// + /// Pair of address and size to represent a binary blob. + using BinaryBlob = std::pair; + + static llvm::Expected> + loadLLVMKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel); + + static llvm::Expected> + loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel); +}; +} // namespace translation +} // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp index 77217419e98bd..fc481f23b141d 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp @@ -16,7 +16,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/Linker/Linker.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -86,12 +85,19 @@ SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() { return Opts; } -Expected> -SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx, - BinaryBlob Input) { - // Create an input stream for the binary blob. +Expected> +SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { + std::unique_ptr Result{nullptr}; + + SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo; + assert(BinInfo.Format == BinaryFormat::SPIRV && + "Only SPIR-V supported as input"); + + // Create an input stream for the SPIR-V binary. std::stringstream SPIRStream( - std::string(reinterpret_cast(Input.first), Input.second), + std::string(reinterpret_cast(BinInfo.BinaryStart), + BinInfo.BinarySize), std::ios_base::in | std::ios_base::binary); std::string ErrMsg; // Create a raw pointer. readSpirv accepts a reference to a pointer, @@ -105,77 +111,13 @@ SPIRVLLVMTranslator::readAndTranslateSPIRV(LLVMContext &LLVMCtx, "Failed to load and translate SPIR-V module with error %s", ErrMsg.c_str()); } - return std::unique_ptr(LLVMMod); -} + std::unique_ptr NewMod{LLVMMod}; -Expected> -SPIRVLLVMTranslator::loadSPIRVKernels(llvm::LLVMContext &LLVMCtx, - std::vector &Kernels) { - std::unique_ptr Result{nullptr}; - bool First = true; - DenseSet ParsedSPIRVModules; - size_t AddressBits = 0; - for (auto &Kernel : Kernels) { - // FIXME: Currently, we use the front of the list. - // Do we need to iterate to find the most suitable - // SPIR-V module? - SYCLKernelBinaryInfo &BinInfo = Kernel.BinaryInfo; - // TODO(Lukas, ONNX-399): Also support LLVM IR as input but simply skipping - // the translation from SPIR-V to LLVM. - assert(BinInfo.Format == BinaryFormat::SPIRV && - "Only SPIR-V supported as input"); - const unsigned char *SPRModulePtr = BinInfo.BinaryStart; - size_t SPRModuleSize = BinInfo.BinarySize; - BinaryBlob BinBlob{SPRModulePtr, SPRModuleSize}; - if (ParsedSPIRVModules.contains(BinBlob)) { - // Multiple kernels can be stored in the same SPIR-V module. - // If we encountered the same SPIR-V module before, skip. - // NOTE: We compare the pointer as well as the size, in case - // a previous kernel only referenced part of the SPIR-V module. - // Not sure this can actually happen, but better safe than sorry. - continue; - } - // Simply load and translate the SPIR-V into the currently still empty - // module. - PROPAGATE_ERROR(NewMod, readAndTranslateSPIRV(LLVMCtx, BinBlob)); - - // We do not assume that the input binary information has the address bits - // set, but rather retrieve this information from the SPIR-V/LLVM module's - // data-layout. - BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits(); - assert((First || BinInfo.AddressBits == AddressBits) && - "Address bits do not match"); - // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or - // 'work_group_size_hint' from metadata attached to the kernel function and - // store it in the SYCLKernelInfo. - // TODO(Lukas, ONNX-399): Validate that DPC++ used metadata to represent - // that information. - restoreKernelAttributes(NewMod.get(), Kernel); - - if (First) { - // We can simply assign the module we just loaded from SPIR-V to the - // empty pointer on the first iteration. - Result = std::move(NewMod); - // The first module will dictate the address bits for the remaining. - AddressBits = BinInfo.AddressBits; - First = false; - } else { - // We have already loaded some module, so now we need to - // link the module we just loaded with the result so far. - // FIXME: We allow duplicates to be overridden by the module - // read last. This could cause problems if different modules contain - // definitions with the same name, but different body/content. - // Check that this is not problematic. - Linker::linkModules(*Result, std::move(NewMod), - Linker::Flags::OverrideFromSrc); - if (AddressBits != BinInfo.AddressBits) { - return createStringError( - inconvertibleErrorCode(), - "Number of address bits between SPIR-V modules does not match"); - } - } - } - return std::move(Result); + // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or + // 'work_group_size_hint' from metadata attached to the kernel function and + // store it in the SYCLKernelInfo. + restoreKernelAttributes(NewMod.get(), Kernel); + return std::move(NewMod); } Expected diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h index 5f1d416e45150..d82a9cc82466c 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h @@ -25,8 +25,7 @@ class SPIRVLLVMTranslator { /// /// Load a list of SPIR-V kernels into a single LLVM module. static llvm::Expected> - loadSPIRVKernels(llvm::LLVMContext &LLVMCtx, - std::vector &Kernels); + loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel); /// /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and @@ -35,10 +34,6 @@ class SPIRVLLVMTranslator { JITContext &JITCtx); private: - /// - /// Pair of address and size to represent a binary blob. - using BinaryBlob = std::pair; - /// /// Get an attribute value consisting of NumValues scalar constant integers /// from the MDNode. @@ -53,12 +48,6 @@ class SPIRVLLVMTranslator { /// - work_group_size_hint static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info); - /// - /// Read the given SPIR-V binary and translate it to a new LLVM module - /// associated with the given context. - static llvm::Expected> - readAndTranslateSPIRV(llvm::LLVMContext &LLVMCtx, BinaryBlob Input); - /// /// Default settings for the SPIRV translation options. static SPIRV::TranslatorOpts &translatorOpts(); diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 4e092ef12c4e3..d8a59a58f8a36 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -39,6 +39,70 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) { } } +std::pair +retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { + auto KernelName = KernelCG->getKernelName(); + + bool isNvidia = Queue->getDeviceImplPtr()->getPlugin().getBackend() == + backend::ext_oneapi_cuda; + if (isNvidia) { + auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); + std::vector KernelIds; + KernelIds.push_back(KernelID); + auto DeviceImages = + ProgramManager::getInstance().getRawDeviceImages(KernelIds); + const RTDeviceBinaryImage *DeviceImage = nullptr; + for (auto *DI : DeviceImages) { + // We are looking for a device image with LLVM IR format and target spec + // "llvm_nvptx64", which has been set by the offload-wrapper action. + if (DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && + DI->getRawData().DeviceTargetSpec == std::string("llvm_nvptx64")) { + DeviceImage = DI; + break; + } + } + if (!DeviceImage) { + return {nullptr, nullptr}; + } + auto ContextImpl = Queue->getContextImplPtr(); + auto Context = detail::createSyclObjFromImpl(ContextImpl); + auto DeviceImpl = Queue->getDeviceImplPtr(); + auto Device = detail::createSyclObjFromImpl(DeviceImpl); + RT::PiProgram Program = + detail::ProgramManager::getInstance().createPIProgram(*DeviceImage, + Context, Device); + return {DeviceImage, Program}; + } + + const RTDeviceBinaryImage *DeviceImage = nullptr; + RT::PiProgram Program = nullptr; + if (KernelCG->getKernelBundle() != nullptr) { + // Retrieve the device image from the kernel bundle. + auto KernelBundle = KernelCG->getKernelBundle(); + kernel_id KernelID = + detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + + auto SyclKernel = detail::getSyclObjImpl( + KernelBundle->get_kernel(KernelID, KernelBundle)); + + DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref(); + Program = SyclKernel->getDeviceImage()->get_program_ref(); + } else if (KernelCG->MSyclKernel != nullptr) { + DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref(); + Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref(); + } else { + auto ContextImpl = Queue->getContextImplPtr(); + auto Context = detail::createSyclObjFromImpl(ContextImpl); + auto DeviceImpl = Queue->getDeviceImplPtr(); + auto Device = detail::createSyclObjFromImpl(DeviceImpl); + DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage( + KernelCG->MOSModuleHandle, KernelName, Context, Device); + Program = detail::ProgramManager::getInstance().createPIProgram( + *DeviceImage, Context, Device); + } + return {DeviceImage, Program}; +} + static ::jit_compiler::ParameterKind translateArgType(kernel_param_kind_t Kind) { using PK = ::jit_compiler::ParameterKind; @@ -576,43 +640,20 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, "Cannot fuse kernel with invalid kernel function name"); return nullptr; } - const RTDeviceBinaryImage *DeviceImage = nullptr; - RT::PiProgram Program = nullptr; + + auto [DeviceImage, Program] = retrieveKernelBinary(Queue, KernelCG); + + if (!DeviceImage || !Program) { + printPerformanceWarning("No suitable IR available for fusion"); + return nullptr; + } const KernelArgMask *EliminatedArgs = nullptr; - if (KernelCG->getKernelBundle() != nullptr) { - // Retrieve the device image from the kernel bundle. - auto KernelBundle = KernelCG->getKernelBundle(); - kernel_id KernelID = - detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); - - auto SyclKernel = detail::getSyclObjImpl( - KernelBundle->get_kernel(KernelID, KernelBundle)); - - DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref(); - Program = SyclKernel->getDeviceImage()->get_program_ref(); - EliminatedArgs = SyclKernel->getKernelArgMask(); - } else if (KernelCG->MSyclKernel != nullptr) { - DeviceImage = - KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref(); - Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref(); - EliminatedArgs = KernelCG->MSyclKernel->getKernelArgMask(); - } else { - auto ContextImpl = Queue->getContextImplPtr(); - auto Context = detail::createSyclObjFromImpl(ContextImpl); - auto DeviceImpl = Queue->getDeviceImplPtr(); - auto Device = detail::createSyclObjFromImpl(DeviceImpl); - DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage( - KernelCG->MOSModuleHandle, KernelName, Context, Device); - Program = detail::ProgramManager::getInstance().createPIProgram( - *DeviceImage, Context, Device); + if (Program && (KernelCG->MSyclKernel == nullptr || + !KernelCG->MSyclKernel->isCreatedFromSource())) { EliminatedArgs = detail::ProgramManager::getInstance().getEliminatedKernelArgMask( KernelCG->MOSModuleHandle, Program, KernelName); } - if (!DeviceImage || !Program) { - printPerformanceWarning("No suitable IR available for fusion"); - return nullptr; - } // Collect information about the arguments of this kernel. @@ -666,8 +707,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, return nullptr; } ::jit_compiler::SYCLKernelBinaryInfo BinInfo{ - translateBinaryImageFormat(DeviceImage->getFormat()), 0, - RawDeviceImage.BinaryStart, DeviceImageSize}; + BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize}; constexpr auto SYCLTypeToIndices = [](auto Val) -> ::jit_compiler::Indices { return {Val.get(0), Val.get(1), Val.get(2)}; From 5d1d778b1bb68593c67b7e17ec2c945ea21651b0 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 11 Jan 2023 14:02:59 +0000 Subject: [PATCH 03/25] [SYCL][Fusion] Add result translation to PTX Signed-off-by: Lukas Sommer --- sycl-fusion/common/include/Kernel.h | 2 +- sycl-fusion/common/lib/KernelIO.h | 1 + sycl-fusion/jit-compiler/CMakeLists.txt | 6 +- sycl-fusion/jit-compiler/include/JITContext.h | 16 ++- sycl-fusion/jit-compiler/include/Options.h | 6 +- sycl-fusion/jit-compiler/lib/JITContext.cpp | 14 +- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 18 +-- ...{LoadKernels.cpp => KernelTranslation.cpp} | 121 ++++++++++++++++-- .../{LoadKernels.h => KernelTranslation.h} | 14 +- .../lib/translation/SPIRVLLVMTranslation.cpp | 4 +- .../lib/translation/SPIRVLLVMTranslation.h | 2 +- sycl/source/detail/jit_compiler.cpp | 17 +++ 12 files changed, 186 insertions(+), 35 deletions(-) rename sycl-fusion/jit-compiler/lib/translation/{LoadKernels.cpp => KernelTranslation.cpp} (50%) rename sycl-fusion/jit-compiler/lib/translation/{LoadKernels.h => KernelTranslation.h} (64%) diff --git a/sycl-fusion/common/include/Kernel.h b/sycl-fusion/common/include/Kernel.h index 1962dd042ffbe..87726b3368d63 100644 --- a/sycl-fusion/common/include/Kernel.h +++ b/sycl-fusion/common/include/Kernel.h @@ -34,7 +34,7 @@ enum class ParameterKind : uint32_t { }; /// Different binary formats supported as input to the JIT compiler. -enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV }; +enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX }; /// Information about a device intermediate representation module (e.g., SPIR-V, /// LLVM IR) from DPC++. diff --git a/sycl-fusion/common/lib/KernelIO.h b/sycl-fusion/common/lib/KernelIO.h index 12c194f8b4dd4..09058d61e9981 100644 --- a/sycl-fusion/common/lib/KernelIO.h +++ b/sycl-fusion/common/lib/KernelIO.h @@ -47,6 +47,7 @@ template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, jit_compiler::BinaryFormat &BF) { IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM); IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV); + IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX); IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID); } }; diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index 92f0cefd68634..f67eed5fc6c76 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -2,7 +2,7 @@ add_llvm_library(sycl-fusion lib/KernelFusion.cpp lib/JITContext.cpp - lib/translation/LoadKernels.cpp + lib/translation/KernelTranslation.cpp lib/translation/SPIRVLLVMTranslation.cpp lib/fusion/FusionPipeline.cpp lib/fusion/FusionHelper.cpp @@ -20,6 +20,10 @@ add_llvm_library(sycl-fusion Linker ScalarOpts InstCombine + Target + NVPTX + X86 + MC ) target_include_directories(sycl-fusion diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index 4c0616e267941..d4654f19820ce 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -36,17 +36,21 @@ using CacheKeyT = std::optional>>; /// -/// Wrapper around a SPIR-V binary. -class SPIRVBinary { +/// Wrapper around a kernel binary. +class KernelBinary { public: - explicit SPIRVBinary(std::string Binary); + explicit KernelBinary(std::string Binary, BinaryFormat Format); jit_compiler::BinaryAddress address() const; size_t size() const; + BinaryFormat format() const; + private: std::string Blob; + + BinaryFormat Format; }; /// @@ -61,7 +65,8 @@ class JITContext { llvm::LLVMContext *getLLVMContext(); - SPIRVBinary &emplaceSPIRVBinary(std::string Binary); + KernelBinary &emplaceSPIRVBinary(std::string Binary, + BinaryFormat Format); std::optional getCacheEntry(CacheKeyT &Identifier) const; @@ -79,11 +84,12 @@ class JITContext { MutexT BinariesMutex; - std::vector Binaries; + std::vector Binaries; mutable MutexT CacheMutex; std::unordered_map Cache; + }; } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h index 335f58fb64cf7..4fe7787df00db 100644 --- a/sycl-fusion/jit-compiler/include/Options.h +++ b/sycl-fusion/jit-compiler/include/Options.h @@ -9,12 +9,13 @@ #ifndef SYCL_FUSION_JIT_COMPILER_OPTIONS_H #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H +#include "Kernel.h" #include #include namespace jit_compiler { -enum OptionID { VerboseOutput, EnableCaching }; +enum OptionID { VerboseOutput, EnableCaching, TargetFormat }; class OptionPtrBase {}; @@ -78,6 +79,9 @@ struct JITEnableVerbose : public OptionBase {}; struct JITEnableCaching : public OptionBase {}; +struct JITTargetFormat + : public OptionBase {}; + } // namespace option } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp index 68c7031b9d8a9..e1dda8b928c45 100644 --- a/sycl-fusion/jit-compiler/lib/JITContext.cpp +++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp @@ -11,14 +11,17 @@ using namespace jit_compiler; -SPIRVBinary::SPIRVBinary(std::string Binary) : Blob{std::move(Binary)} {} +KernelBinary::KernelBinary(std::string Binary, BinaryFormat Fmt) + : Blob{std::move(Binary)}, Format{Fmt} {} -jit_compiler::BinaryAddress SPIRVBinary::address() const { +jit_compiler::BinaryAddress KernelBinary::address() const { // FIXME: Verify it's a good idea to perform this reinterpret_cast here. return reinterpret_cast(Blob.c_str()); } -size_t SPIRVBinary::size() const { return Blob.size(); } +size_t KernelBinary::size() const { return Blob.size(); } + +BinaryFormat KernelBinary::format() const { return Format; } JITContext::JITContext() : LLVMCtx{new llvm::LLVMContext}, Binaries{} {} @@ -26,11 +29,12 @@ JITContext::~JITContext() = default; llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); } -SPIRVBinary &JITContext::emplaceSPIRVBinary(std::string Binary) { +KernelBinary &JITContext::emplaceSPIRVBinary(std::string Binary, + BinaryFormat Format) { WriteLockT WriteLock{BinariesMutex}; // NOTE: With C++17, which returns a reference from emplace_back, the // following code would be even simpler. - Binaries.emplace_back(std::move(Binary)); + Binaries.emplace_back(std::move(Binary), Format); return Binaries.back(); } diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index fc168587738b5..eeffb0af6ced8 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -15,7 +15,7 @@ #include "fusion/FusionPipeline.h" #include "helper/ConfigHelper.h" #include "helper/ErrorHandling.h" -#include "translation/LoadKernels.h" +#include "translation/KernelTranslation.h" #include "translation/SPIRVLLVMTranslation.h" #include #include @@ -98,8 +98,8 @@ FusionResult KernelFusion::fuseKernels( // Load all input kernels from their respective SPIR-V modules into a single // LLVM IR module. llvm::Expected> ModOrError = - translation::KernelLoader::loadKernels(*JITCtx.getLLVMContext(), - ModuleInfo.kernels()); + translation::KernelTranslator::loadKernels(*JITCtx.getLLVMContext(), + ModuleInfo.kernels()); if (auto Error = ModOrError.takeError()) { return errorToFusionResult(std::move(Error), "SPIR-V translation failed"); } @@ -137,14 +137,14 @@ FusionResult KernelFusion::fuseKernels( SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName); - // Translate the LLVM IR module resulting from the fusion pass into SPIR-V. - llvm::Expected BinaryOrError = - translation::SPIRVLLVMTranslator::translateLLVMtoSPIRV(*NewMod, JITCtx); - if (auto Error = BinaryOrError.takeError()) { + // TODO + BinaryFormat TargetFormat = ConfigHelper::get(); + + if (auto Error = translation::KernelTranslator::translateKernel( + FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) { return errorToFusionResult(std::move(Error), - "Translation to SPIR-V failed"); + "Translation to output format failed"); } - jit_compiler::SPIRVBinary *SPIRVBin = *BinaryOrError; FusedKernelInfo.NDR = FusedKernel.FusedNDRange; diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp similarity index 50% rename from sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp rename to sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index cabf1f6c44e9f..31fa850d58d63 100644 --- a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -1,4 +1,4 @@ -//==-------------------------- LoadKernels.cpp ----------------------------==// +//==----------------------- KernelTranslation.cpp -------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,19 +6,24 @@ // //===----------------------------------------------------------------------===// -#include "LoadKernels.h" +#include "KernelTranslation.h" #include "SPIRVLLVMTranslation.h" #include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" using namespace jit_compiler; using namespace jit_compiler::translation; using namespace llvm; llvm::Expected> -KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx, - std::vector &Kernels) { +KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx, + std::vector &Kernels) { std::unique_ptr Result{nullptr}; bool First = true; DenseSet ParsedBinaries; @@ -100,8 +105,8 @@ KernelLoader::loadKernels(llvm::LLVMContext &LLVMCtx, } llvm::Expected> -KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, - SYCLKernelInfo &Kernel) { +KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { auto &BinInfo = Kernel.BinaryInfo; llvm::StringRef RawData(reinterpret_cast(BinInfo.BinaryStart), BinInfo.BinarySize); @@ -110,7 +115,107 @@ KernelLoader::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, } llvm::Expected> -KernelLoader::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, - SYCLKernelInfo &Kernel) { +KernelTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, + SYCLKernelInfo &Kernel) { return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel); } + +llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, + llvm::Module &Mod, + JITContext &JITCtx, + BinaryFormat Format) { + + KernelBinary *KernelBin = nullptr; + switch (Format) { + case BinaryFormat::SPIRV: { + llvm::Expected BinaryOrError = + translateToSPIRV(Mod, JITCtx); + if (auto Error = BinaryOrError.takeError()) { + return Error; + } + KernelBin = *BinaryOrError; + break; + } + case BinaryFormat::PTX: { + llvm::Expected BinaryOrError = translateToPTX(Mod, JITCtx); + if (auto Error = BinaryOrError.takeError()) { + return Error; + } + KernelBin = *BinaryOrError; + break; + } + default: { + return createStringError( + inconvertibleErrorCode(), + "Failed to translate kernel to unsupported output format"); + } + } + + // Update the KernelInfo for the fused kernel with the address and size of the + // SPIR-V binary resulting from translation. + SYCLKernelBinaryInfo &FusedBinaryInfo = Kernel.BinaryInfo; + FusedBinaryInfo.Format = Format; + // Output SPIR-V should use the same number of address bits as the input + // SPIR-V. SPIR-V translation requires all modules to use the same number of + // address bits, so it's safe to take the value from the first one. + FusedBinaryInfo.AddressBits = Mod.getDataLayout().getPointerSizeInBits(); + FusedBinaryInfo.BinaryStart = KernelBin->address(); + FusedBinaryInfo.BinarySize = KernelBin->size(); + return Error::success(); +} + +llvm::Expected +KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) { + return SPIRVLLVMTranslator::translateLLVMtoSPIRV(Mod, JITCtx); +} + +llvm::Expected +KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { + // FIXME: Can we limit this to the NVPTX specific target? + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + llvm::InitializeAllTargetMCs(); + + std::string TargetTriple{"nvptx64-nvidia-cuda"}; + + std::string ErrorMessage; + const auto *Target = + llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage); + + if (!Target) { + return createStringError( + inconvertibleErrorCode(), + "Failed to load and translate SPIR-V module with error %s", + ErrorMessage.c_str()); + } + + // FIXME: Check whether we can provide more accurate target information here + auto *TargetMachine = Target->createTargetMachine( + TargetTriple, "sm_50", "+sm_50,+ptx76", {}, llvm::Reloc::PIC_, + std::nullopt, llvm::CodeGenOpt::Default); + + llvm::legacy::PassManager PM; + + std::string PTXASM; + + { + llvm::raw_string_ostream ASMStream{PTXASM}; + llvm::buffer_ostream BufferedASM{ASMStream}; + + if (TargetMachine->addPassesToEmitFile(PM, BufferedASM, nullptr, + llvm::CGFT_AssemblyFile)) { + return createStringError( + inconvertibleErrorCode(), + "Failed to construct pass pipeline to emit output"); + } + + PM.run(Mod); + ASMStream.flush(); + } + + llvm::dbgs() << "PTX size: " << PTXASM.size() << "\n"; + llvm::dbgs() << "PTX:\n" << PTXASM << "\n"; + + return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX); +} diff --git a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h similarity index 64% rename from sycl-fusion/jit-compiler/lib/translation/LoadKernels.h rename to sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 5720abf09bfd7..3d6824edbdd37 100644 --- a/sycl-fusion/jit-compiler/lib/translation/LoadKernels.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -1,4 +1,4 @@ -//==-- LoadKernels.h - Load LLVM IR for SYCL kernels in different formats -==// +//==- KernelTranslation - Translate SYCL kernels between different formats -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "JITContext.h" #include "Kernel.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -15,12 +16,15 @@ namespace jit_compiler { namespace translation { -class KernelLoader { +class KernelTranslator { public: static llvm::Expected> loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels); + static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod, + JITContext &JITCtx, BinaryFormat Format); + private: /// /// Pair of address and size to represent a binary blob. @@ -31,6 +35,12 @@ class KernelLoader { static llvm::Expected> loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, SYCLKernelInfo &Kernel); + + static llvm::Expected translateToSPIRV(llvm::Module &Mod, + JITContext &JITCtx); + + static llvm::Expected translateToPTX(llvm::Module &Mod, + JITContext &JITCtx); }; } // namespace translation } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp index fc481f23b141d..1046fd6dc2907 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp @@ -120,7 +120,7 @@ SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, return std::move(NewMod); } -Expected +Expected SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) { std::ostringstream BinaryStream; std::string ErrMsg; @@ -131,5 +131,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) { "Translation of LLVM IR to SPIR-V failed with error %s", ErrMsg.c_str()); } - return &JITCtx.emplaceSPIRVBinary(BinaryStream.str()); + return &JITCtx.emplaceSPIRVBinary(BinaryStream.str(), BinaryFormat::SPIRV); } diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h index d82a9cc82466c..440c00103b0d5 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h @@ -30,7 +30,7 @@ class SPIRVLLVMTranslator { /// /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and /// return a pointer to its container. - static llvm::Expected translateLLVMtoSPIRV(llvm::Module &Mod, + static llvm::Expected translateLLVMtoSPIRV(llvm::Module &Mod, JITContext &JITCtx); private: diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index d8a59a58f8a36..f34f0ac682f7b 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -39,6 +39,21 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) { } } +::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) { + auto Backend = Queue->getDeviceImplPtr()->getPlugin().getBackend(); + switch (Backend) { + case backend::ext_oneapi_level_zero: + case backend::opencl: + return ::jit_compiler::BinaryFormat::SPIRV; + case backend::ext_oneapi_cuda: + return ::jit_compiler::BinaryFormat::PTX; + default: + throw sycl::exception( + sycl::make_error_code(sycl::errc::feature_not_supported), + "Backend unsupported by kernel fusion"); + } +} + std::pair retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { auto KernelName = KernelCG->getKernelName(); @@ -796,6 +811,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled); JITConfig.set<::jit_compiler::option::JITEnableCaching>( detail::SYCLConfig::get()); + JITConfig.set<::jit_compiler::option::JITTargetFormat>( + getTargetFormat(Queue)); auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels( *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames, From d32c7586cb014d2191440f2a3d050b6cc9205e8b Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Thu, 12 Jan 2023 08:56:43 +0000 Subject: [PATCH 04/25] [SYCL][Fusion] Provide correct target spec; Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 12 ------ sycl/source/detail/jit_compiler.cpp | 38 ++++++++++++++----- sycl/source/detail/jit_compiler.hpp | 3 +- sycl/source/detail/jit_device_binaries.cpp | 10 ++--- sycl/source/detail/jit_device_binaries.hpp | 4 +- 5 files changed, 37 insertions(+), 30 deletions(-) diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index eeffb0af6ced8..6c14ac182514a 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -148,18 +148,6 @@ FusionResult KernelFusion::fuseKernels( FusedKernelInfo.NDR = FusedKernel.FusedNDRange; - // Update the KernelInfo for the fused kernel with the address and size of the - // SPIR-V binary resulting from translation. - SYCLKernelBinaryInfo &FusedBinaryInfo = FusedKernelInfo.BinaryInfo; - FusedBinaryInfo.Format = BinaryFormat::SPIRV; - // Output SPIR-V should use the same number of address bits as the input - // SPIR-V. SPIR-V translation requires all modules to use the same number of - // address bits, so it's safe to take the value from the first one. - FusedBinaryInfo.AddressBits = - ModuleInfo.kernels().front().BinaryInfo.AddressBits; - FusedBinaryInfo.BinaryStart = SPIRVBin->address(); - FusedBinaryInfo.BinarySize = SPIRVBin->size(); - if (CachingEnabled) { JITCtx.addCacheEntry(CacheKey, FusedKernelInfo); } diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index f34f0ac682f7b..40e8f946135e3 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -811,8 +811,9 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled); JITConfig.set<::jit_compiler::option::JITEnableCaching>( detail::SYCLConfig::get()); - JITConfig.set<::jit_compiler::option::JITTargetFormat>( - getTargetFormat(Queue)); + + ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue); + JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat); auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels( *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames, @@ -854,7 +855,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage); if (!FusionResult.cached()) { - auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo); + auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat); detail::ProgramManager::getInstance().addImages(PIDeviceBinaries); } else if (DebugEnabled) { std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; @@ -866,9 +867,11 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, FusedKernelInfo.Name); std::vector> RawExtendedMembers; - std::shared_ptr KernelBundleImplPtr = - detail::getSyclObjImpl(get_kernel_bundle( - Queue->get_context(), {Queue->get_device()}, {FusedKernelId})); + std::shared_ptr KernelBundleImplPtr; + if (TargetFormat == ::jit_compiler::BinaryFormat::SPIRV) { + detail::getSyclObjImpl(get_kernel_bundle( + Queue->get_context(), {Queue->get_device()}, {FusedKernelId})); + } std::unique_ptr FusedCG; FusedCG.reset(new detail::CGExecKernel( @@ -881,7 +884,25 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, } pi_device_binaries jit_compiler::createPIDeviceBinary( - const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) { + const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, + ::jit_compiler::BinaryFormat Format) { + + const char *TargetSpec = nullptr; + switch (Format) { + case ::jit_compiler::BinaryFormat::PTX: { + TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64; + break; + } + case ::jit_compiler::BinaryFormat::SPIRV: { + TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64) + ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64 + : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32; + break; + } + default: + sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Invalid output format"); + } DeviceBinaryContainer Binary; @@ -909,8 +930,7 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( DeviceBinariesCollection Collection; Collection.addDeviceBinary(std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart, - FusedKernelInfo.BinaryInfo.BinarySize, - FusedKernelInfo.BinaryInfo.AddressBits); + FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec); JITDeviceBinaries.push_back(std::move(Collection)); return JITDeviceBinaries.back().getPIDeviceStruct(); diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 522c0749ef75b..e02be562de3ee 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -13,6 +13,7 @@ #include namespace jit_compiler { +enum class BinaryFormat; class JITContext; struct SYCLKernelInfo; using ArgUsageMask = std::vector; @@ -46,7 +47,7 @@ class jit_compiler { jit_compiler &operator=(const jit_compiler &&) = delete; pi_device_binaries - createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo); + createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, ::jit_compiler::BinaryFormat Format); std::vector encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const; diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp index 0aa778da14240..eadd71d021ee8 100644 --- a/sycl/source/detail/jit_device_binaries.cpp +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -81,7 +81,7 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) { } pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( - const unsigned char *BinaryStart, size_t BinarySize, size_t AddressBits) { + const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec) { pi_device_binary_struct DeviceBinary; DeviceBinary.Version = PI_DEVICE_BINARY_VERSION; DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL; @@ -94,9 +94,7 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( DeviceBinary.BinaryStart = BinaryStart; DeviceBinary.BinaryEnd = BinaryStart + BinarySize; DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV; - DeviceBinary.DeviceTargetSpec = (AddressBits == 32) - ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32 - : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64; + DeviceBinary.DeviceTargetSpec = TargetSpec; DeviceBinary.EntriesBegin = PIOffloadEntries.data(); DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size(); DeviceBinary.PropertySetsBegin = PIPropertySets.data(); @@ -108,14 +106,14 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - size_t AddressBits) { + const char* TargetSpec) { // Adding to the vectors might trigger reallocation, which would invalidate // the pointers used for PI structs if a PI struct has already been created // via getPIDeviceStruct(). Forbid calls to this method after the first PI // struct has been created. assert(Fused && "Adding to container would invalidate existing PI structs"); PIBinaries.push_back( - Cont.getPIDeviceBinary(BinaryStart, BinarySize, AddressBits)); + Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec)); Binaries.push_back(std::move(Cont)); } diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp index 7bf2c7d9fe07b..6fa142543b61f 100644 --- a/sycl/source/detail/jit_device_binaries.hpp +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -111,7 +111,7 @@ class DeviceBinaryContainer { pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart, size_t BinarySize, - size_t AddressBits); + const char* TargetSpec); private: bool Fused = true; @@ -138,7 +138,7 @@ class DeviceBinariesCollection { void addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - size_t AddressBits); + const char* TargetSpec); pi_device_binaries getPIDeviceStruct(); private: From b99478118db2a3fb3f00f0d24358fb7d7843eb11 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Fri, 13 Jan 2023 13:16:28 +0000 Subject: [PATCH 05/25] [SYCL][Fusion] Avoid removing dependencies Avoid the dependencies of the dependencies to be removed when cleaning up the input commands from the graph without executing them. Signed-off-by: Lukas Sommer --- sycl/source/detail/scheduler/commands.hpp | 8 ++++++++ sycl/source/detail/scheduler/graph_builder.cpp | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 35df6ef614cc7..a019ab9378226 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -289,6 +289,14 @@ class Command { // XPTI instrumentation. Copy code location details to the internal struct. // Memory is allocated in this method and released in destructor. void copySubmissionCodeLocation(); + + /// Clear all dependency events for device and host dependencies. This should + /// only be used if a command is about to be deleted without being executed + /// before that. + void clearAllDependencies() { + MPreparedDepsEvents.clear(); + MPreparedHostDepsEvents.clear(); + } /// Contains list of dependencies(edges) std::vector MDeps; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5dee68c6e69d2..a404fe707698b 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1396,6 +1396,10 @@ void Scheduler::GraphBuilder::removeNodeFromGraph( } Node->MDeps.clear(); + // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called + // from the destructor of the command to delete the dependencies of the + // command this command depends on. + Node->clearAllDependencies(); } void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue, From 651847eb18cc96cdc7f859c7d6bec282940be248 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Fri, 13 Jan 2023 13:23:14 +0000 Subject: [PATCH 06/25] [SYCL][Fusion] Set device binary image format Signed-off-by: Lukas Sommer --- sycl/source/detail/jit_compiler.cpp | 18 ++++++++++++------ sycl/source/detail/jit_device_binaries.cpp | 8 ++++---- sycl/source/detail/jit_device_binaries.hpp | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 40e8f946135e3..04a09ee84a72e 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -811,7 +811,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled); JITConfig.set<::jit_compiler::option::JITEnableCaching>( detail::SYCLConfig::get()); - + ::jit_compiler::BinaryFormat TargetFormat = getTargetFormat(Queue); JITConfig.set<::jit_compiler::option::JITTargetFormat>(TargetFormat); @@ -854,10 +854,13 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, }(FusedKernelInfo.NDR); updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage); + OSModuleHandle Handle = OSUtil::DummyModuleHandle; if (!FusionResult.cached()) { auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat); detail::ProgramManager::getInstance().addImages(PIDeviceBinaries); + Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries); } else if (DebugEnabled) { + // TODO(Lukas): Create correct OSModuleHandle when using a cached binary. std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; } @@ -878,8 +881,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr), std::move(ArgsStorage), std::move(AccStorage), std::move(RawExtendedMembers), std::move(Requirements), std::move(Events), - std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {}, - {}, CG::CGTYPE::Kernel, KernelCacheConfig)); + std::move(FusedArgs), FusedKernelInfo.Name, Handle, {}, {}, + CG::CGTYPE::Kernel, KernelCacheConfig)); return FusedCG; } @@ -888,15 +891,18 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( ::jit_compiler::BinaryFormat Format) { const char *TargetSpec = nullptr; + pi_device_binary_type BinFormat = PI_DEVICE_BINARY_TYPE_NATIVE; switch (Format) { case ::jit_compiler::BinaryFormat::PTX: { TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64; + BinFormat = PI_DEVICE_BINARY_TYPE_NONE; break; } case ::jit_compiler::BinaryFormat::SPIRV: { TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64) ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64 : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32; + BinFormat = PI_DEVICE_BINARY_TYPE_SPIRV; break; } default: @@ -928,9 +934,9 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( Binary.addProperty(std::move(ArgMaskPropSet)); DeviceBinariesCollection Collection; - Collection.addDeviceBinary(std::move(Binary), - FusedKernelInfo.BinaryInfo.BinaryStart, - FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec); + Collection.addDeviceBinary( + std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart, + FusedKernelInfo.BinaryInfo.BinarySize, TargetSpec, BinFormat); JITDeviceBinaries.push_back(std::move(Collection)); return JITDeviceBinaries.back().getPIDeviceStruct(); diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp index eadd71d021ee8..59530a0e691ce 100644 --- a/sycl/source/detail/jit_device_binaries.cpp +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -81,10 +81,11 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) { } pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( - const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec) { + const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec, pi_device_binary_type Format) { pi_device_binary_struct DeviceBinary; DeviceBinary.Version = PI_DEVICE_BINARY_VERSION; DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL; + DeviceBinary.Format = Format; DeviceBinary.CompileOptions = ""; DeviceBinary.LinkOptions = ""; DeviceBinary.ManifestStart = nullptr; @@ -93,7 +94,6 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( // the JITContext. DeviceBinary.BinaryStart = BinaryStart; DeviceBinary.BinaryEnd = BinaryStart + BinarySize; - DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV; DeviceBinary.DeviceTargetSpec = TargetSpec; DeviceBinary.EntriesBegin = PIOffloadEntries.data(); DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size(); @@ -106,14 +106,14 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec) { + const char* TargetSpec, pi_device_binary_type Format) { // Adding to the vectors might trigger reallocation, which would invalidate // the pointers used for PI structs if a PI struct has already been created // via getPIDeviceStruct(). Forbid calls to this method after the first PI // struct has been created. assert(Fused && "Adding to container would invalidate existing PI structs"); PIBinaries.push_back( - Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec)); + Cont.getPIDeviceBinary(BinaryStart, BinarySize, TargetSpec, Format)); Binaries.push_back(std::move(Cont)); } diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp index 6fa142543b61f..96079d3a25a19 100644 --- a/sycl/source/detail/jit_device_binaries.hpp +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -111,7 +111,7 @@ class DeviceBinaryContainer { pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec); + const char* TargetSpec, pi_device_binary_type Format); private: bool Fused = true; @@ -138,7 +138,7 @@ class DeviceBinariesCollection { void addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec); + const char* TargetSpec, pi_device_binary_type Format); pi_device_binaries getPIDeviceStruct(); private: From 7fddbd5908acfd36116c8ce412f402947fe4b42f Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Mon, 16 Jan 2023 14:28:36 +0000 Subject: [PATCH 07/25] [SYCL][Fusion] Refactor target-specific processing Signed-off-by: Lukas Sommer --- sycl-fusion/passes/CMakeLists.txt | 1 + .../passes/kernel-fusion/SYCLKernelFusion.cpp | 133 +++-------- .../passes/kernel-fusion/SYCLKernelFusion.h | 9 +- .../passes/target/TargetFusionInfo.cpp | 225 ++++++++++++++++++ sycl-fusion/passes/target/TargetFusionInfo.h | 137 +++++++++++ 5 files changed, 395 insertions(+), 110 deletions(-) create mode 100644 sycl-fusion/passes/target/TargetFusionInfo.cpp create mode 100644 sycl-fusion/passes/target/TargetFusionInfo.h diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index fe81b76b5bbcb..41398f6ee9e99 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -35,6 +35,7 @@ add_llvm_library(SYCLKernelFusionPasses syclcp/SYCLCP.cpp cleanup/Cleanup.cpp debug/PassDebug.cpp + target/TargetFusionInfo.cpp DEPENDS intrinsics_gen diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp index 241496ac9044f..d3dec7f16a81f 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp @@ -31,12 +31,6 @@ using namespace llvm; -constexpr static StringLiteral KernelArgAddrSpace{"kernel_arg_addr_space"}; -constexpr static StringLiteral KernelArgAccessQual{"kernel_arg_access_qual"}; -constexpr static StringLiteral KernelArgType{"kernel_arg_type"}; -constexpr static StringLiteral KernelArgBaseType{"kernel_arg_base_type"}; -constexpr static StringLiteral KernelArgTypeQual{"kernel_arg_type_qual"}; - constexpr StringLiteral SYCLKernelFusion::NDRangeMDKey; constexpr StringLiteral SYCLKernelFusion::NDRangesMDKey; @@ -144,6 +138,8 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) { AM.getResult(M).ModuleInfo; assert(ModuleInfo && "No module information available"); + auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + // Iterate over the functions in the module and locate all // stub functions identified by metadata. SmallPtrSet ToCleanUp; @@ -156,7 +152,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) { // attached to this stub function. // The newly created function will carry the name also specified // in the metadata. - if (auto Err = fuseKernel(M, F, ModuleInfo, ToCleanUp)) { + if (auto Err = fuseKernel(M, F, ModuleInfo, TFI, ToCleanUp)) { DeferredErrs = joinErrors(std::move(DeferredErrs), std::move(Err)); } // Rembember the stub for deletion, as it is not required anymore after @@ -164,6 +160,10 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) { ToCleanUp.insert(&F); } } + // Notify the target-specific logic that some functions will be erased + // shortly. + SmallVector NotifyDelete{ToCleanUp.begin(), ToCleanUp.end()}; + TFI.notifyFunctionsDelete(NotifyDelete); // Delete all the stub functions for (Function *SF : ToCleanUp) { SF->eraseFromParent(); @@ -230,11 +230,13 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder, return {Entry, CallInsertion, Exit}; } -static Expected createFusionCall( - IRBuilderBase &Builder, Function *F, ArrayRef CallArgs, - const jit_compiler::NDRange &SrcNDRange, - const jit_compiler::NDRange &FusedNDRange, bool IsLast, int BarriersFlags, - jit_compiler::Remapper &Remapper, bool ShouldRemap) { +static Expected +createFusionCall(IRBuilderBase &Builder, Function *F, + ArrayRef CallArgs, + const jit_compiler::NDRange &SrcNDRange, + const jit_compiler::NDRange &FusedNDRange, bool IsLast, + int BarriersFlags, jit_compiler::Remapper &Remapper, + bool ShouldRemap, TargetFusionInfo &TargetInfo) { const auto IPs = addGuard(Builder, SrcNDRange, FusedNDRange, IsLast); if (ShouldRemap) { @@ -260,7 +262,7 @@ static Expected createFusionCall( // Insert barrier if needed if (!IsLast && BarriersFlags > 0) { - jit_compiler::barrierCall(Builder, BarriersFlags); + TargetInfo.createBarrierCall(Builder, BarriersFlags); } // Set insert point for future insertions @@ -271,6 +273,7 @@ static Expected createFusionCall( Error SYCLKernelFusion::fuseKernel( Module &M, Function &StubFunction, jit_compiler::SYCLModuleInfo *ModInfo, + TargetFusionInfo &TargetInfo, SmallPtrSetImpl &ToCleanUp) const { // Retrieve the metadata from the stub function. // The first operand of the tuple is the name that the newly created, @@ -343,12 +346,9 @@ Error SYCLKernelFusion::fuseKernel( SmallVector FusedArgNames; SmallVector FusedParamAttributes; // We must keep track of some metadata attached to each parameter. - // Collect it in lists, so it can be attached to the fused function later on. - MDList KernelArgAddressSpaces; - MDList KernelArgAccessQualifiers; - MDList KernelArgTypes; - MDList KernelArgBaseTypes; - MDList KernelArgTypeQualifiers; + // Collect it, so it can be attached to the fused function later on. + MetadataCollection MDCollection{TargetInfo.getKernelMetadataKeys()}; + // Add the information about the new kernel to the SYCLModuleInfo. // Initialize the jit_compiler::SYCLKernelInfo with the name. The remaining // information for functor & argument layout and attributes will be filled in @@ -425,14 +425,7 @@ Error SYCLKernelFusion::fuseKernel( // Add the metadata corresponding to the used arguments to the different // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because // the kernel arguments receive new names in the fused kernel. - addToFusedMetadata(FF, KernelArgAddrSpace, UsedArgsMask, - KernelArgAddressSpaces); - addToFusedMetadata(FF, KernelArgAccessQual, UsedArgsMask, - KernelArgAccessQualifiers); - addToFusedMetadata(FF, KernelArgType, UsedArgsMask, KernelArgTypes); - addToFusedMetadata(FF, KernelArgBaseType, UsedArgsMask, KernelArgBaseTypes); - addToFusedMetadata(FF, KernelArgTypeQual, UsedArgsMask, - KernelArgTypeQualifiers); + MDCollection.collectFromFunction(FF, UsedArgsMask); // Update the fused kernel's KernelInfo with information from this input // kernel. @@ -502,26 +495,15 @@ Error SYCLKernelFusion::fuseKernel( AI.value().setName(ArgName); KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName)); } - // Attach the fused kernel_arg_* metadata collected from the different input + // Attach the fused metadata collected from the different input // kernels to the fused function. - attachFusedMetadata(FusedFunction, "kernel_arg_addr_space", - KernelArgAddressSpaces); - attachFusedMetadata(FusedFunction, "kernel_arg_access_qual", - KernelArgAccessQualifiers); - attachFusedMetadata(FusedFunction, "kernel_arg_type", KernelArgTypes); - attachFusedMetadata(FusedFunction, "kernel_arg_base_type", - KernelArgBaseTypes); - attachFusedMetadata(FusedFunction, "kernel_arg_type_qual", - KernelArgTypeQualifiers); - attachFusedMetadata(FusedFunction, "kernel_arg_name", KernelArgNames); + MDCollection.attachToFunction(FusedFunction); // Add metadata for reqd_work_group_size and work_group_size_hint attachKernelAttributeMD(LLVMCtx, FusedFunction, FusedKernelInfo); - // The fused kernel should be a SPIR-V kernel again. - // NOTE: If this pass is used in a scenario where input and output - // of the compilation are not SPIR-V, care must be taken of other - // potential calling conventions here (e.g., nvptx). - FusedFunction->setCallingConv(CallingConv::SPIR_KERNEL); + // Mark the fused function as a kernel by calling TargetFusionInfo, because + // this is target-specific. + TargetInfo.addKernelFunction(FusedFunction); // Fusion is implemented as a two step process: In the first step, we // simply create calls to the functions that should be fused into this @@ -557,9 +539,9 @@ Error SYCLKernelFusion::fuseKernel( unsigned ParamIdx = ParamMapping[{FuncIndex, I}]; CallArgs.push_back(FusedFunction->getArg(ParamIdx)); } - auto CallOrErr = createFusionCall(Builder, IF, CallArgs, KF.ND, NDRange, - FuncIndex == BarriersEnd, BarriersFlags, - Remapper, IsHeterogeneousNDRangesList); + auto CallOrErr = createFusionCall( + Builder, IF, CallArgs, KF.ND, NDRange, FuncIndex == BarriersEnd, + BarriersFlags, Remapper, IsHeterogeneousNDRangesList, TargetInfo); // Add to the set of original kernel functions that can be deleted after // fusion is complete. ToCleanUp.insert(IF); @@ -602,46 +584,8 @@ Error SYCLKernelFusion::fuseKernel( } } - // Remove all existing calls of the ITT instrumentation functions. Insert new - // ones in the entry block of the fused kernel and every exit block if the - // functions are present in the module. - // We cannot use the existing SPIRITTAnnotations pass, because that pass might - // insert calls to functions not present in the module (e.g., ITT - // instrumentations for barriers). As the JITed module is not linked with - // libdevice anymore, the functions would remain unresolved and cause the - // driver to fail. - Function *StartWrapperFunc = M.getFunction(ITTStartWrapper); - Function *FinishWrapperFunc = M.getFunction(ITTFinishWrapper); - bool InsertWrappers = - ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) && - (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration())); - auto *WrapperFuncTy = - FunctionType::get(Type::getVoidTy(M.getContext()), /*isVarArg*/ false); - for (auto &BB : *FusedFunction) { - for (auto Inst = BB.begin(); Inst != BB.end();) { - if (auto *CB = dyn_cast(Inst)) { - if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) { - Inst = Inst->eraseFromParent(); - continue; - } - } - ++Inst; - } - if (InsertWrappers) { - if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { - auto *WrapperCall = - CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI); - WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); - } - } - } - if (InsertWrappers) { - FusedFunction->getEntryBlock().getFirstInsertionPt(); - auto *WrapperCall = CallInst::Create( - WrapperFuncTy, StartWrapperFunc, "", - &*FusedFunction->getEntryBlock().getFirstInsertionPt()); - WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); - } + // Perform target-specific post-processing of the new fused kernel. + TargetInfo.postProcessKernel(FusedFunction); return Error::success(); } @@ -717,23 +661,6 @@ static unsigned getUnsignedFromMD(Metadata *MD) { return ConstInt->getZExtValue(); } -void SYCLKernelFusion::addToFusedMetadata( - Function *InputFunction, const StringRef &Kind, - const ArrayRef IsArgPresentMask, - SmallVectorImpl &FusedMDList) const { - // Retrieve metadata from one of the input kernels and add it to the list - // of fused metadata. - assert(InputFunction->hasMetadata(Kind) && - "Required Metadata not present on input kernel"); - if (auto *MD = InputFunction->getMetadata(Kind)) { - for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) { - if (std::get<0>(MaskedOps)) { - FusedMDList.emplace_back(std::get<1>(MaskedOps).get()); - } - } - } -} - void SYCLKernelFusion::attachFusedMetadata( Function *FusedFunction, const StringRef &Kind, const ArrayRef FusedMetadata) const { diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h index 5e26595d2d343..5f52f0a317d14 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h @@ -10,6 +10,7 @@ #define SYCL_FUSION_PASSES_SYCLKERNELFUSION_H #include "Kernel.h" +#include "target/TargetFusionInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -52,8 +53,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { // locate our own metadata again. static constexpr auto MetadataKind = "sycl.kernel.fused"; static constexpr auto ParameterMDKind = "sycl.kernel.param"; - static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; - static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; using MDList = llvm::SmallVector; @@ -112,6 +111,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { llvm::Error fuseKernel(llvm::Module &M, llvm::Function &StubFunction, jit_compiler::SYCLModuleInfo *ModInfo, + llvm::TargetFusionInfo &TargetInfo, llvm::SmallPtrSetImpl &ToCleanUp) const; void canonicalizeParameters( @@ -119,11 +119,6 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { Parameter getParamFromMD(llvm::Metadata *MD) const; - void addToFusedMetadata( - llvm::Function *InputFunction, const llvm::StringRef &Kind, - const llvm::ArrayRef IsArgPresentMask, - llvm::SmallVectorImpl &FusedMDList) const; - void attachFusedMetadata( llvm::Function *FusedFunction, const llvm::StringRef &Kind, const llvm::ArrayRef FusedMetadata) const; diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp new file mode 100644 index 0000000000000..191ab676613bb --- /dev/null +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -0,0 +1,225 @@ +//==---------------------- TargetFusionInfo.cpp ----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TargetFusionInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsNVPTX.h" + +using namespace llvm; + +// +// TargetFusionInfo +// + +TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) { + llvm::Triple Tri(Mod->getTargetTriple()); + if (Tri.isNVPTX()) { + return TargetFusionInfo( + std::shared_ptr(new NVPTXTargetFusionInfo(Mod))); + } + if (Tri.isSPIRV()) { + return TargetFusionInfo( + std::shared_ptr(new SPIRVTargetFusionInfo(Mod))); + } + assert(false && "Unsupported target for fusion"); +} + +// +// SPIRVTargetFusionInfo +// + +void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) { + KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL); +} + +ArrayRef SPIRVTargetFusionInfo::getKernelMetadataKeys() { + static SmallVector Keys{ + {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", + "kernel_arg_base_type", "kernel_arg_type_qual"}}; + return Keys; +} + +void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) { + static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; + static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; + // Remove all existing calls of the ITT instrumentation functions. Insert new + // ones in the entry block of the fused kernel and every exit block if the + // functions are present in the module. + // We cannot use the existing SPIRITTAnnotations pass, because that pass might + // insert calls to functions not present in the module (e.g., ITT + // instrumentations for barriers). As the JITed module is not linked with + // libdevice anymore, the functions would remain unresolved and cause the + // driver to fail. + Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper); + Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper); + bool InsertWrappers = + ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) && + (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration())); + auto *WrapperFuncTy = FunctionType::get( + Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false); + for (auto &BB : *KernelFunc) { + for (auto Inst = BB.begin(); Inst != BB.end();) { + if (auto *CB = dyn_cast(Inst)) { + if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) { + Inst = Inst->eraseFromParent(); + continue; + } + } + ++Inst; + } + if (InsertWrappers) { + if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { + auto *WrapperCall = + CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI); + WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); + } + } + } + if (InsertWrappers) { + KernelFunc->getEntryBlock().getFirstInsertionPt(); + auto *WrapperCall = + CallInst::Create(WrapperFuncTy, StartWrapperFunc, "", + &*KernelFunc->getEntryBlock().getFirstInsertionPt()); + WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); + } +} + +void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) { + if (BarrierFlags == -1) { + return; + } + assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) && + "Invalid barrier flags"); + + static const auto FnAttrs = AttributeSet::get( + LLVMMod->getContext(), + {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent), + Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)}); + + static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"}; + + Function *F = LLVMMod->getFunction(N); + if (!F) { + constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage; + + auto *Ty = FunctionType::get( + Builder.getVoidTy(), + {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()}, + false /* isVarArg*/); + + F = Function::Create(Ty, Linkage, N, *LLVMMod); + + F->setAttributes( + AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); + F->setCallingConv(CallingConv::SPIR_FUNC); + } + + // See + // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- + SmallVector Args{ + Builder.getInt32(/*Exec Scope : Workgroup = */ 2), + Builder.getInt32(/*Exec Scope : Workgroup = */ 2), + Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) | + ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))}; + + auto *BarrierCallInst = Builder.CreateCall(F, Args); + BarrierCallInst->setAttributes( + AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); + BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC); +} + +// +// NVPTXTargetFusionInfo +// + +void NVPTXTargetFusionInfo::notifyFunctionsDelete( + llvm::ArrayRef Funcs) { + SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; + SmallVector ValidKernels; + auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); + for (auto *Op : OldAnnotations->operands()) { + if (auto *TOp = dyn_cast(Op)) { + if (auto *COp = dyn_cast_if_present( + TOp->getOperand(0).get())) { + if (!DeletedFuncs.contains(COp->getValue())) { + ValidKernels.push_back(Op); + // Add to the set to also remove duplicate entries. + DeletedFuncs.insert(COp->getValue()); + } + } + } + } + LLVMMod->eraseNamedMetadata(OldAnnotations); + auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + for (auto *Kernel : ValidKernels) { + NewAnnotations->addOperand(Kernel); + } +} + +void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) { + auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + auto *MDOne = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); + auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel"); + auto *MDFunc = ConstantAsMetadata::get(KernelFunc); + SmallVector KernelMD({MDFunc, MDKernelString, MDOne}); + auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD); + NVVMAnnotations->addOperand(Tuple); +} + +ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() { + // FIXME: Check whether we need to take care of sycl_fixed_targets. + static SmallVector Keys{{"kernel_arg_buffer_location", + "kernel_arg_runtime_aligned", + "kernel_arg_exclusive_ptr"}}; + return Keys; +} + +void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) { + if (BarrierFlags == -1) { + return; + } + // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX + // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0 + // instruction, equivalent to the __syncthreads() call in CUDA." + Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {}); +} + +// +// MetadataCollection +// + +MetadataCollection::MetadataCollection(ArrayRef MDKeys) + : Keys{MDKeys}, Collection(MDKeys.size()) {} + +void MetadataCollection::collectFromFunction( + llvm::Function *Func, const ArrayRef IsArgPresentMask) { + for (auto &Key : Keys) { + // TODO: Do we want to assert for the presence of the metadata here? + if (auto *MD = Func->getMetadata(Key)) { + for (auto MaskedOps : llvm::zip(IsArgPresentMask, MD->operands())) { + if (std::get<0>(MaskedOps)) { + Collection[Key].emplace_back(std::get<1>(MaskedOps).get()); + } + } + } + } +} + +void MetadataCollection::attachToFunction(llvm::Function *Func) { + for (auto &Key : Keys) { + // Attach a list of fused metadata for a kind to the fused function. + auto *MDEntries = MDNode::get(Func->getContext(), Collection[Key]); + Func->setMetadata(Key, MDEntries); + } +} diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h new file mode 100644 index 0000000000000..653af4904e2ae --- /dev/null +++ b/sycl-fusion/passes/target/TargetFusionInfo.h @@ -0,0 +1,137 @@ +//==-- TargetFusionInfo.h - Encapsule target-specific fusion functionality -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" + +namespace llvm { + +class TargetFusionInfoImpl { + +public: + virtual ~TargetFusionInfoImpl() = default; + + virtual void notifyFunctionsDelete(llvm::ArrayRef Funcs) { + (void)Funcs; + } + + virtual void addKernelFunction(Function *KernelFunc) { (void)KernelFunc; } + + virtual void postProcessKernel(Function *KernelFunc) { (void)KernelFunc; } + + virtual ArrayRef getKernelMetadataKeys() { return {}; } + + virtual void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) = 0; + +protected: + explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {}; + + llvm::Module *LLVMMod; + + friend class TargetFusionInfo; +}; + +class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { +public: + void addKernelFunction(Function *KernelFunc) override; + + ArrayRef getKernelMetadataKeys() override; + + void postProcessKernel(Function *KernelFunc) override; + + void createBarrierCall(IRBuilderBase& Builder, int BarrierFlags) override; + +private: + using TargetFusionInfoImpl::TargetFusionInfoImpl; +}; + +class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { +public: + void notifyFunctionsDelete(llvm::ArrayRef Funcs) override; + + void addKernelFunction(Function *KernelFunc) override; + + ArrayRef getKernelMetadataKeys() override; + + void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) override; + +private: + using TargetFusionInfoImpl::TargetFusionInfoImpl; +}; + +/// +/// Common interface to target-specific logic around handling of kernel +/// functions. +class TargetFusionInfo { +public: + /// + /// Create the correct target-specific implementation based on the target + /// triple of \p Module. + static TargetFusionInfo getTargetFusionInfo(llvm::Module *Module); + + /// + /// Notify the target-specific implementation that set of functions \p Funcs + /// is about to be erased from the module. This should be called BEFORE + /// erasing the functions. + void notifyFunctionsDelete(llvm::ArrayRef Funcs) { + Impl->notifyFunctionsDelete(Funcs); + } + + /// + /// Notify the target-specific implementation that the function \p KernelFunc + /// was added as a new kernel. This should be called AFTER the function has + /// been added. + void addKernelFunction(llvm::Function *KernelFunc) { + Impl->addKernelFunction(KernelFunc); + } + + /// + /// Target-specific post-processing of the new kernel function \p KernelFunc. + /// This should be called AFTER the function has been added and defined. + void postProcessKernel(Function *KernelFunc) { + Impl->postProcessKernel(KernelFunc); + } + + /// + /// Get the target-specific list of argument metadata attached to each + /// function that should be collected and attached to the fused kernel. + llvm::ArrayRef getKernelMetadataKeys() { + return Impl->getKernelMetadataKeys(); + } + + void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) { + Impl->createBarrierCall(Builder, BarrierFlags); + } + +private: + using ImplPtr = std::shared_ptr; + + TargetFusionInfo(ImplPtr &&I) : Impl{I} {} + + ImplPtr Impl; +}; + +/// +/// Simple helper to collect a target-specific set of kernel argument metadata +/// from input functions and attach it to a fused kernel. +class MetadataCollection { +public: + explicit MetadataCollection(llvm::ArrayRef MDKeys); + + void collectFromFunction(llvm::Function *Func, + const ArrayRef IsArgPresentMask); + + void attachToFunction(llvm::Function *Func); + +private: + llvm::SmallVector Keys; + + llvm::StringMap> Collection; +}; +} // namespace llvm From 894ee89494f467f16e347258246a56bf04a03a5c Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 17 Jan 2023 08:19:21 +0000 Subject: [PATCH 08/25] [SYCL][Fusion] Do not require null terminator Signed-off-by: Lukas Sommer --- .../jit-compiler/lib/translation/KernelTranslation.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 31fa850d58d63..7e1318c70a4a9 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -111,7 +111,10 @@ KernelTranslator::loadLLVMKernel(llvm::LLVMContext &LLVMCtx, llvm::StringRef RawData(reinterpret_cast(BinInfo.BinaryStart), BinInfo.BinarySize); return llvm::parseBitcodeFile( - MemoryBuffer::getMemBuffer(RawData)->getMemBufferRef(), LLVMCtx); + MemoryBuffer::getMemBuffer(RawData, Kernel.Name, + /* RequiresNullTermnator*/ false) + ->getMemBufferRef(), + LLVMCtx); } llvm::Expected> From a909a75bc2fcf3f80ee1835ffdcc6187dfdb6cad Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 17 Jan 2023 14:42:40 +0000 Subject: [PATCH 09/25] [SYCL][Fusion] Refactor more target-specific code Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 1 - .../lib/fusion/FusionPipeline.cpp | 19 ++++- .../lib/translation/KernelTranslation.cpp | 3 - sycl-fusion/passes/CMakeLists.txt | 1 + sycl-fusion/passes/cleanup/Cleanup.cpp | 17 ++-- sycl-fusion/passes/cleanup/Cleanup.h | 4 +- .../internalization/Internalization.cpp | 61 ++++--------- .../passes/kernel-fusion/SYCLKernelFusion.cpp | 13 +-- sycl-fusion/passes/syclcp/SYCLCP.cpp | 12 +-- .../passes/target/TargetFusionInfo.cpp | 60 +++++++++++-- sycl-fusion/passes/target/TargetFusionInfo.h | 85 +++++++++++++++---- 11 files changed, 177 insertions(+), 99 deletions(-) diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 6c14ac182514a..821ad4cd36369 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -137,7 +137,6 @@ FusionResult KernelFusion::fuseKernels( SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName); - // TODO BinaryFormat TargetFormat = ConfigHelper::get(); if (auto Error = translation::KernelTranslator::translateKernel( diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index 3ede007aa69a4..fb2e2b15200a7 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -23,6 +23,7 @@ #ifndef NDEBUG #include "llvm/IR/Verifier.h" #endif // NDEBUG +#include "llvm/ADT/Triple.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar/ADCE.h" @@ -35,6 +36,21 @@ using namespace llvm; using namespace jit_compiler; using namespace jit_compiler::fusion; +static unsigned getFlatAddressSpace(Module &Mod) { + // Ideally, we could get this information from the TargetTransformInfo, but + // the SPIR-V backend does not yet seem to have an implementation for that. + llvm::Triple Tri(Mod.getTargetTriple()); + if (Tri.isNVPTX()) { + return 0; + } + if (Tri.isSPIRV() || Tri.isSPIR()) { + return 4; + } + // Identical to the definition of "UninitializedAddressSpace" in + // "InferAddressSpaces.cpp". + return std::numeric_limits::max(); +} + std::unique_ptr FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, int BarriersFlags) { @@ -86,9 +102,8 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, // Run the InferAddressSpace pass to remove as many address-space casts // to/from generic address-space as possible, because these hinder // internalization. - // FIXME: TTI should tell the pass which address space to use. // Ideally, the static compiler should have performed that job. - constexpr unsigned FlatAddressSpace = 4; + unsigned FlatAddressSpace = getFlatAddressSpace(Mod); FPM.addPass(InferAddressSpacesPass(FlatAddressSpace)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 7e1318c70a4a9..3db02dbfd717c 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -217,8 +217,5 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { ASMStream.flush(); } - llvm::dbgs() << "PTX size: " << PTXASM.size() << "\n"; - llvm::dbgs() << "PTX:\n" << PTXASM << "\n"; - return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX); } diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index 41398f6ee9e99..c74c3c40e34e0 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(SYCLKernelFusion MODULE syclcp/SYCLCP.cpp cleanup/Cleanup.cpp debug/PassDebug.cpp + target/TargetFusionInfo.cpp DEPENDS intrinsics_gen diff --git a/sycl-fusion/passes/cleanup/Cleanup.cpp b/sycl-fusion/passes/cleanup/Cleanup.cpp index 07fc86d9d0dd3..a5bc3e634c527 100644 --- a/sycl-fusion/passes/cleanup/Cleanup.cpp +++ b/sycl-fusion/passes/cleanup/Cleanup.cpp @@ -45,7 +45,8 @@ static void copyAttributesFrom(const BitVector &Mask, Function *NF, PAL.getRetAttrs(), Attributes)); } -static Function *createMaskedFunction(const BitVector &Mask, Function *F) { +static Function *createMaskedFunction(const BitVector &Mask, Function *F, + TargetFusionInfo &TFI) { // Declare FunctionType *NFTy = createMaskedFunctionType(Mask, F->getFunctionType()); Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(), @@ -78,7 +79,9 @@ static Function *createMaskedFunction(const BitVector &Mask, Function *F) { } // Erase old function + TFI.notifyFunctionsDelete(F); F->eraseFromParent(); + TFI.addKernelFunction(NF); return NF; } @@ -104,9 +107,9 @@ static void updateArgUsageMask(jit_compiler::SYCLKernelInfo *Info, static void applyArgMask(const jit_compiler::ArgUsageMask &NewArgInfo, const BitVector &Mask, Function *F, - ModuleAnalysisManager &AM) { + ModuleAnalysisManager &AM, TargetFusionInfo &TFI) { // Create the function without the masked-out args. - Function *NF = createMaskedFunction(Mask, F); + Function *NF = createMaskedFunction(Mask, F, TFI); // Update the unused args mask. jit_compiler::SYCLModuleInfo *ModuleInfo = AM.getResult(*NF->getParent()).ModuleInfo; @@ -125,9 +128,7 @@ static void maskMD(const BitVector &Mask, Function *F) { SmallVector> MD; F->getAllMetadata(MD); for (const auto &Entry : MD) { - auto MDKind = Entry.first; - if (MDKind == F->getContext().getMDKindID("reqd_work_group_size") || - MDKind == F->getContext().getMDKindID("work_group_size_hint")) { + if (Entry.second->getNumOperands() != Mask.size()) { // Some metadata, e.g., the metadata for reqd_work_group_size and // work_group_size_hint is independent from the number of arguments // and must not be filtered by the argument usage mask. @@ -144,7 +145,7 @@ static void maskMD(const BitVector &Mask, Function *F) { void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, Function *F, ModuleAnalysisManager &AM, - ArrayRef MDToErase) { + TargetFusionInfo &TFI, ArrayRef MDToErase) { // Erase metadata. for (auto Key : MDToErase) { F->setMetadata(Key, nullptr); @@ -158,5 +159,5 @@ void llvm::fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, // Update metadata. maskMD(CleanupMask, F); // Remove arguments. - applyArgMask(ArgUsageInfo, CleanupMask, F, AM); + applyArgMask(ArgUsageInfo, CleanupMask, F, AM, TFI); } diff --git a/sycl-fusion/passes/cleanup/Cleanup.h b/sycl-fusion/passes/cleanup/Cleanup.h index 49619e4b9af07..491d96f46a886 100644 --- a/sycl-fusion/passes/cleanup/Cleanup.h +++ b/sycl-fusion/passes/cleanup/Cleanup.h @@ -10,6 +10,7 @@ #define SYCL_FUSION_PASSES_CLEANUP_H #include "Kernel.h" +#include "target/TargetFusionInfo.h" #include #include #include @@ -25,7 +26,8 @@ namespace llvm { /// @param[in] AM Module analysis manager. /// @param[in] EraseMD Keys of metadata to remove. void fullCleanup(const jit_compiler::ArgUsageMask &ArgUsageInfo, Function *F, - ModuleAnalysisManager &AM, ArrayRef EraseMD); + ModuleAnalysisManager &AM, TargetFusionInfo &TFI, + ArrayRef EraseMD); } // namespace llvm #endif // SYCL_FUSION_PASSES_CLEANUP_H diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index 61f3a0738921b..bca46ba43acb3 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -19,16 +19,12 @@ #include "cleanup/Cleanup.h" #include "debug/PassDebug.h" #include "metadata/MDParsing.h" +#include "target/TargetFusionInfo.h" #define DEBUG_TYPE "sycl-fusion" using namespace llvm; -// Corresponds to definition of spir_private and spir_local in -// "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap". -constexpr static unsigned PrivateAS{0}; -constexpr static unsigned LocalAS{3}; - constexpr static StringLiteral PrivatePromotion{"private"}; constexpr static StringLiteral LocalPromotion{"local"}; constexpr static StringLiteral NoPromotion{"none"}; @@ -44,6 +40,8 @@ struct SYCLInternalizerImpl { StringRef Kind; /// Whether or not to create allocas. bool CreateAllocas; + /// Interface to target-specific information. + TargetFusionInfo TargetInfo; /// Implements internalization the pass run. PreservedAnalyses operator()(Module &M, ModuleAnalysisManager &AM) const; @@ -338,11 +336,14 @@ Error SYCLInternalizerImpl::checkArgsPromotable( /// /// Function to perform the required cleaning actions. -static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal) { +static void cleanup(Function *OldF, Function *NewF, bool KeepOriginal, + const TargetFusionInfo &TFI) { if (!KeepOriginal) { NewF->takeName(OldF); + TFI.notifyFunctionsDelete(OldF); OldF->eraseFromParent(); } + TFI.addKernelFunction(NewF); } void SYCLInternalizerImpl::promoteCall(CallBase *C, const Value *Val, @@ -499,11 +500,6 @@ Value *replaceByNewAlloca(Argument *Arg, unsigned AS, std::size_t LocalSize) { Function *SYCLInternalizerImpl::promoteFunctionArgs( Function *OldF, ArrayRef PromoteToLocal, bool CreateAllocas, bool KeepOriginal) const { - constexpr unsigned AddressSpaceBitWidth{32}; - - auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get( - IntegerType::get(OldF->getContext(), AddressSpaceBitWidth), AS)); - // We first declare the promoted function with the new signature. Function *NewF = getPromotedFunctionDeclaration(OldF, PromoteToLocal, AS, @@ -542,32 +538,9 @@ Function *SYCLInternalizerImpl::promoteFunctionArgs( promoteValue(Arg, LocalSize); } - { - constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"}; - if (auto *AddrspaceMD = - dyn_cast_or_null(NewF->getMetadata(KernelArgAddrSpaceMD))) { - // If we have kernel_arg_addr_space metadata in the original function, - // we should update it in the new one. - SmallVector NewInfo{AddrspaceMD->op_begin(), - AddrspaceMD->op_end()}; - for (auto I : enumerate(PromoteToLocal)) { - if (I.value() == 0) { - continue; - } - const auto Index = I.index(); - if (const auto *PtrTy = - dyn_cast(NewF->getArg(Index)->getType())) { - if (PtrTy->getAddressSpace() == LocalAS) { - NewInfo[Index] = NewAddrspace; - } - } - } - NewF->setMetadata(KernelArgAddrSpaceMD, - MDNode::get(NewF->getContext(), NewInfo)); - } - } + TargetInfo.updateAddressSpaceMetadata(NewF, PromoteToLocal, AS); - cleanup(OldF, NewF, KeepOriginal); + cleanup(OldF, NewF, KeepOriginal, TargetInfo); return NewF; } @@ -625,7 +598,8 @@ SYCLInternalizerImpl::operator()(Module &M, ModuleAnalysisManager &AM) const { return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } -static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) { +static void moduleCleanup(Module &M, ModuleAnalysisManager &AM, + TargetFusionInfo &TFI) { SmallVector ToProcess; for (auto &F : M) { if (F.hasMetadata(SYCLInternalizer::Key)) { @@ -650,24 +624,25 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) { NewArgInfo.push_back(jit_compiler::ArgUsage::Used); } } - fullCleanup(NewArgInfo, F, AM, + fullCleanup(NewArgInfo, F, AM, TFI, {SYCLInternalizer::Key, SYCLInternalizer::LocalSizeKey}); } } PreservedAnalyses llvm::SYCLInternalizer::run(Module &M, ModuleAnalysisManager &AM) { + auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); // Private promotion - const PreservedAnalyses Tmp = - SYCLInternalizerImpl{PrivateAS, PrivatePromotion, true}(M, AM); + const PreservedAnalyses Tmp = SYCLInternalizerImpl{ + TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM); // Local promotion - PreservedAnalyses Res = - SYCLInternalizerImpl{LocalAS, LocalPromotion, false}(M, AM); + PreservedAnalyses Res = SYCLInternalizerImpl{ + TFI.getLocalAddressSpace(), LocalPromotion, false, TFI}(M, AM); Res.intersect(Tmp); if (!Res.areAllPreserved()) { - moduleCleanup(M, AM); + moduleCleanup(M, AM, TFI); } return Res; } diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp index d3dec7f16a81f..a0079affbaa06 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp @@ -423,8 +423,7 @@ Error SYCLKernelFusion::fuseKernel( ++ParamIndex; } // Add the metadata corresponding to the used arguments to the different - // lists. NOTE: We do not collect the "kernel_arg_name" metadata, because - // the kernel arguments receive new names in the fused kernel. + // lists. MDCollection.collectFromFunction(FF, UsedArgsMask); // Update the fused kernel's KernelInfo with information from this input @@ -484,16 +483,12 @@ Error SYCLKernelFusion::fuseKernel( } // Attach names to the arguments. The name includes a prefix for the kernel - // from which this argument came. The names are also attached as metadata - // with kind "kernel_arg_name". - // NOTE: While the kernel_arg_name metadata is required, naming the - // parameters themselves is not necessary for functionality, it just improves - // readibility for debugging purposes. - SmallVector KernelArgNames; + // from which this argument came. Naming the parameters themselves is not + // necessary for functionality, it just improves readibility for debugging + // purposes. for (const auto &AI : llvm::enumerate(FusedFunction->args())) { auto &ArgName = FusedArgNames[AI.index()]; AI.value().setName(ArgName); - KernelArgNames.push_back(MDString::get(LLVMCtx, ArgName)); } // Attach the fused metadata collected from the different input // kernels to the fused function. diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp index b520620c232d1..5cb9b00433ef7 100644 --- a/sycl-fusion/passes/syclcp/SYCLCP.cpp +++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp @@ -41,7 +41,7 @@ static Expected> getCPFromMD(Function *F) { MDNode *MD = F->getMetadata(SYCLCP::Key); if (!MD) { return createStringError(inconvertibleErrorCode(), - "Private promotion metadata not available"); + "Constant progagation metadata not available"); } for (auto I : enumerate(MD->operands())) { Expected> Val = @@ -205,7 +205,8 @@ static bool propagateConstants(Function *F, ArrayRef Constants) { return Changed; } -static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) { +static void moduleCleanup(Module &M, ModuleAnalysisManager &AM, + TargetFusionInfo &TFI) { SmallVector ToProcess; for (auto &F : M) { if (F.hasMetadata(SYCLCP::Key)) { @@ -219,14 +220,13 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM) { if (const auto *MDS = dyn_cast(I.value().get())) { // A value is masked-out if it has a non-empty MDString if (MDS->getLength() > 0) { - // And is either an integer or a FP number. NewArgInfo.push_back(jit_compiler::ArgUsage::Unused); continue; } } NewArgInfo.push_back(jit_compiler::ArgUsage::Used); } - fullCleanup(NewArgInfo, F, AM, {SYCLCP::Key}); + fullCleanup(NewArgInfo, F, AM, TFI, {SYCLCP::Key}); } } @@ -249,8 +249,10 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) { Changed = propagateConstants(F, *ConstantsOrErr) || Changed; } + auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + if (Changed) { - moduleCleanup(M, AM); + moduleCleanup(M, AM, TFI); } return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index 191ab676613bb..c17e2464ac323 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -25,7 +25,7 @@ TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) { return TargetFusionInfo( std::shared_ptr(new NVPTXTargetFusionInfo(Mod))); } - if (Tri.isSPIRV()) { + if (Tri.isSPIRV() || Tri.isSPIR()) { return TargetFusionInfo( std::shared_ptr(new SPIRVTargetFusionInfo(Mod))); } @@ -36,18 +36,28 @@ TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) { // SPIRVTargetFusionInfo // -void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) { +void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) const { KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL); } -ArrayRef SPIRVTargetFusionInfo::getKernelMetadataKeys() { +ArrayRef SPIRVTargetFusionInfo::getKernelMetadataKeys() const { + // NOTE: We do not collect the "kernel_arg_name" metadata, because + // the kernel arguments receive new names in the fused kernel. static SmallVector Keys{ {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", "kernel_arg_base_type", "kernel_arg_type_qual"}}; return Keys; } -void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) { +void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) const { + // Attach the kernel_arg_name metadata. + SmallVector KernelArgNames; + for (auto &P : KernelFunc->args()) { + KernelArgNames.push_back(MDString::get(LLVMMod->getContext(), P.getName())); + } + auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames); + KernelFunc->setMetadata("kernel_arg_name", ArgNameMD); + static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; // Remove all existing calls of the ITT instrumentation functions. Insert new @@ -93,7 +103,7 @@ void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) { } void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) { + int BarrierFlags) const { if (BarrierFlags == -1) { return; } @@ -137,12 +147,44 @@ void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC); } +void SPIRVTargetFusionInfo::updateAddressSpaceMetadata( + Function *KernelFunc, ArrayRef LocalSize, + unsigned AddressSpace) const { + static constexpr unsigned AddressSpaceBitWidth{32}; + static constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"}; + + auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get( + IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth), + AddressSpace)); + if (auto *AddrspaceMD = dyn_cast_or_null( + KernelFunc->getMetadata(KernelArgAddrSpaceMD))) { + // If we have kernel_arg_addr_space metadata in the original function, + // we should update it in the new one. + SmallVector NewInfo{AddrspaceMD->op_begin(), + AddrspaceMD->op_end()}; + for (auto I : enumerate(LocalSize)) { + if (I.value() == 0) { + continue; + } + const auto Index = I.index(); + if (const auto *PtrTy = + dyn_cast(KernelFunc->getArg(Index)->getType())) { + if (PtrTy->getAddressSpace() == getLocalAddressSpace()) { + NewInfo[Index] = NewAddrspace; + } + } + } + KernelFunc->setMetadata(KernelArgAddrSpaceMD, + MDNode::get(KernelFunc->getContext(), NewInfo)); + } +} + // // NVPTXTargetFusionInfo // void NVPTXTargetFusionInfo::notifyFunctionsDelete( - llvm::ArrayRef Funcs) { + llvm::ArrayRef Funcs) const { SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; SmallVector ValidKernels; auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); @@ -165,7 +207,7 @@ void NVPTXTargetFusionInfo::notifyFunctionsDelete( } } -void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) { +void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) const { auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); auto *MDOne = ConstantAsMetadata::get( ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); @@ -176,7 +218,7 @@ void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) { NVVMAnnotations->addOperand(Tuple); } -ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() { +ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() const { // FIXME: Check whether we need to take care of sycl_fixed_targets. static SmallVector Keys{{"kernel_arg_buffer_location", "kernel_arg_runtime_aligned", @@ -185,7 +227,7 @@ ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() { } void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) { + int BarrierFlags) const { if (BarrierFlags == -1) { return; } diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h index 653af4904e2ae..c307de2dd764e 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.h +++ b/sycl-fusion/passes/target/TargetFusionInfo.h @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#ifndef SYCL_FUSION_PASSES_TARGETFUSIONINFO_H +#define SYCL_FUSION_PASSES_TARGETFUSIONINFO_H + #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" @@ -17,17 +20,33 @@ class TargetFusionInfoImpl { public: virtual ~TargetFusionInfoImpl() = default; - virtual void notifyFunctionsDelete(llvm::ArrayRef Funcs) { + virtual void notifyFunctionsDelete(llvm::ArrayRef Funcs) const { (void)Funcs; } - virtual void addKernelFunction(Function *KernelFunc) { (void)KernelFunc; } + virtual void addKernelFunction(Function *KernelFunc) const { + (void)KernelFunc; + } + + virtual void postProcessKernel(Function *KernelFunc) const { + (void)KernelFunc; + } + + virtual ArrayRef getKernelMetadataKeys() const { return {}; } + + virtual void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const = 0; - virtual void postProcessKernel(Function *KernelFunc) { (void)KernelFunc; } + virtual unsigned getPrivateAddressSpace() const = 0; - virtual ArrayRef getKernelMetadataKeys() { return {}; } + virtual unsigned getLocalAddressSpace() const = 0; - virtual void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) = 0; + virtual void updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const { + (void)KernelFunc; + (void)LocalSize; + } protected: explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {}; @@ -39,13 +58,23 @@ class TargetFusionInfoImpl { class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { public: - void addKernelFunction(Function *KernelFunc) override; + void addKernelFunction(Function *KernelFunc) const override; + + ArrayRef getKernelMetadataKeys() const override; - ArrayRef getKernelMetadataKeys() override; + void postProcessKernel(Function *KernelFunc) const override; - void postProcessKernel(Function *KernelFunc) override; + void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const override; - void createBarrierCall(IRBuilderBase& Builder, int BarrierFlags) override; + // Corresponds to definition of spir_private and spir_local in + // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap". + unsigned getPrivateAddressSpace() const override { return 0; } + unsigned getLocalAddressSpace() const override { return 3; } + + void updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const override; private: using TargetFusionInfoImpl::TargetFusionInfoImpl; @@ -53,13 +82,19 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { public: - void notifyFunctionsDelete(llvm::ArrayRef Funcs) override; + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override; + + void addKernelFunction(Function *KernelFunc) const override; - void addKernelFunction(Function *KernelFunc) override; + ArrayRef getKernelMetadataKeys() const override; - ArrayRef getKernelMetadataKeys() override; + void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const override; - void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) override; + // Corresponds to the definitions in the LLVM NVPTX backend user guide: + // https://llvm.org/docs/NVPTXUsage.html#address-spaces + unsigned getPrivateAddressSpace() const override { return 0; } + unsigned getLocalAddressSpace() const override { return 3; } private: using TargetFusionInfoImpl::TargetFusionInfoImpl; @@ -79,7 +114,7 @@ class TargetFusionInfo { /// Notify the target-specific implementation that set of functions \p Funcs /// is about to be erased from the module. This should be called BEFORE /// erasing the functions. - void notifyFunctionsDelete(llvm::ArrayRef Funcs) { + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const { Impl->notifyFunctionsDelete(Funcs); } @@ -87,28 +122,40 @@ class TargetFusionInfo { /// Notify the target-specific implementation that the function \p KernelFunc /// was added as a new kernel. This should be called AFTER the function has /// been added. - void addKernelFunction(llvm::Function *KernelFunc) { + void addKernelFunction(llvm::Function *KernelFunc) const { Impl->addKernelFunction(KernelFunc); } /// /// Target-specific post-processing of the new kernel function \p KernelFunc. /// This should be called AFTER the function has been added and defined. - void postProcessKernel(Function *KernelFunc) { + void postProcessKernel(Function *KernelFunc) const { Impl->postProcessKernel(KernelFunc); } /// /// Get the target-specific list of argument metadata attached to each /// function that should be collected and attached to the fused kernel. - llvm::ArrayRef getKernelMetadataKeys() { + llvm::ArrayRef getKernelMetadataKeys() const { return Impl->getKernelMetadataKeys(); } - void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) { + void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const { Impl->createBarrierCall(Builder, BarrierFlags); } + unsigned getPrivateAddressSpace() const { + return Impl->getPrivateAddressSpace(); + } + + unsigned getLocalAddressSpace() const { return Impl->getLocalAddressSpace(); } + + void updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const { + Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace); + } + private: using ImplPtr = std::shared_ptr; @@ -135,3 +182,5 @@ class MetadataCollection { llvm::StringMap> Collection; }; } // namespace llvm + +#endif // SYCL_FUSION_PASSES_TARGETFUSIONINFO_H From 32172f0bf088efd70b854fd334bf9f415aa7706c Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 14 Feb 2023 13:20:48 +0000 Subject: [PATCH 10/25] [SYCL][Fusion] Handle attributes for CUDA fusion Handle reqd_work_group_size and work_group_size_hint attributes. Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/include/JITContext.h | 3 +- .../lib/translation/KernelTranslation.cpp | 42 +++++++++++++++++++ .../lib/translation/KernelTranslation.h | 2 +- .../lib/translation/SPIRVLLVMTranslation.cpp | 37 ---------------- .../lib/translation/SPIRVLLVMTranslation.h | 18 +------- sycl/source/detail/jit_compiler.cpp | 40 ++++++++++++++++++ sycl/source/detail/jit_compiler.hpp | 7 +++- sycl/source/detail/jit_device_binaries.cpp | 6 ++- sycl/source/detail/jit_device_binaries.hpp | 5 ++- 9 files changed, 99 insertions(+), 61 deletions(-) diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index d4654f19820ce..c26312ceb70e6 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -65,8 +65,7 @@ class JITContext { llvm::LLVMContext *getLLVMContext(); - KernelBinary &emplaceSPIRVBinary(std::string Binary, - BinaryFormat Format); + KernelBinary &emplaceSPIRVBinary(std::string Binary, BinaryFormat Format); std::optional getCacheEntry(CacheKeyT &Identifier) const; diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 3db02dbfd717c..1436f51f40aaa 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -9,6 +9,7 @@ #include "KernelTranslation.h" #include "SPIRVLLVMTranslation.h" #include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Linker/Linker.h" #include "llvm/MC/TargetRegistry.h" @@ -21,6 +22,43 @@ using namespace jit_compiler; using namespace jit_compiler::translation; using namespace llvm; +/// +/// Get an attribute value consisting of NumValues scalar constant integers +/// from the MDNode. +static void getAttributeValues(std::vector &Values, MDNode *MD) { + for (const auto &MDOp : MD->operands()) { + auto *ConstantMD = cast(MDOp); + auto *ConstInt = cast(ConstantMD->getValue()); + Values.push_back(std::to_string(ConstInt->getZExtValue())); + } +} + +// NOLINTNEXTLINE(readability-identifier-naming) +static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size"; +// NOLINTNEXTLINE(readability-identifier-naming) +static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint"; + +/// +/// Restore kernel attributes for the kernel in Info from the metadata +/// attached to its kernel function in the LLVM module Mod. +/// Currently supported attributes: +/// - reqd_work_group_size +/// - work_group_size_hint +static void restoreKernelAttributes(Module *Mod, SYCLKernelInfo &Info) { + auto *KernelFunction = Mod->getFunction(Info.Name); + assert(KernelFunction && "Kernel function not present in module"); + if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) { + SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR}; + getAttributeValues(ReqdAttr.Values, MD); + Info.Attributes.push_back(ReqdAttr); + } + if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) { + SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR}; + getAttributeValues(HintAttr.Values, MD); + Info.Attributes.push_back(HintAttr); + } +} + llvm::Expected> KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels) { @@ -100,6 +138,10 @@ KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx, "Number of address bits between SPIR-V modules does not match"); } } + // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or + // 'work_group_size_hint' from metadata attached to the kernel function and + // store it in the SYCLKernelInfo. + restoreKernelAttributes(Result.get(), Kernel); } return std::move(Result); } diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 3d6824edbdd37..276b6e536d269 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -23,7 +23,7 @@ class KernelTranslator { loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels); static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod, - JITContext &JITCtx, BinaryFormat Format); + JITContext &JITCtx, BinaryFormat Format); private: /// diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp index 1046fd6dc2907..a5effb33a85e1 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp @@ -11,9 +11,7 @@ #include "Kernel.h" #include "LLVMSPIRVLib.h" #include "helper/ErrorHandling.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/raw_ostream.h" @@ -24,37 +22,6 @@ using namespace jit_compiler; using namespace jit_compiler::translation; using namespace llvm; -void SPIRVLLVMTranslator::getAttributeValues(std::vector &Values, - MDNode *MD, size_t NumValues) { - assert(MD->getNumOperands() == NumValues && "Incorrect number of values"); - for (const auto &MDOp : MD->operands()) { - auto *ConstantMD = cast(MDOp); - auto *ConstInt = cast(ConstantMD->getValue()); - Values.push_back(std::to_string(ConstInt->getZExtValue())); - } -} - -// NOLINTNEXTLINE(readability-identifier-naming) -static const char *REQD_WORK_GROUP_SIZE_ATTR = "reqd_work_group_size"; -// NOLINTNEXTLINE(readability-identifier-naming) -static const char *WORK_GROUP_SIZE_HINT_ATTR = "work_group_size_hint"; - -void SPIRVLLVMTranslator::restoreKernelAttributes(Module *Mod, - SYCLKernelInfo &Info) { - auto *KernelFunction = Mod->getFunction(Info.Name); - assert(KernelFunction && "Kernel function not present in module"); - if (auto *MD = KernelFunction->getMetadata(REQD_WORK_GROUP_SIZE_ATTR)) { - SYCLKernelAttribute ReqdAttr{REQD_WORK_GROUP_SIZE_ATTR}; - getAttributeValues(ReqdAttr.Values, MD, 3); - Info.Attributes.push_back(ReqdAttr); - } - if (auto *MD = KernelFunction->getMetadata(WORK_GROUP_SIZE_HINT_ATTR)) { - SYCLKernelAttribute HintAttr{WORK_GROUP_SIZE_HINT_ATTR}; - getAttributeValues(HintAttr.Values, MD, 3); - Info.Attributes.push_back(HintAttr); - } -} - SPIRV::TranslatorOpts &SPIRVLLVMTranslator::translatorOpts() { static auto Opts = []() -> SPIRV::TranslatorOpts { // Options for translation between SPIR-V and LLVM IR. @@ -113,10 +80,6 @@ SPIRVLLVMTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, } std::unique_ptr NewMod{LLVMMod}; - // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or - // 'work_group_size_hint' from metadata attached to the kernel function and - // store it in the SYCLKernelInfo. - restoreKernelAttributes(NewMod.get(), Kernel); return std::move(NewMod); } diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h index 440c00103b0d5..c8cdf2bf90ca0 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.h @@ -30,24 +30,10 @@ class SPIRVLLVMTranslator { /// /// Translate the LLVM IR module Mod to SPIR-V, store it in the JITContext and /// return a pointer to its container. - static llvm::Expected translateLLVMtoSPIRV(llvm::Module &Mod, - JITContext &JITCtx); + static llvm::Expected + translateLLVMtoSPIRV(llvm::Module &Mod, JITContext &JITCtx); private: - /// - /// Get an attribute value consisting of NumValues scalar constant integers - /// from the MDNode. - static void getAttributeValues(std::vector &Values, - llvm::MDNode *MD, size_t NumValues); - - /// - /// Restore kernel attributes for the kernel in Info from the metadata - /// attached to its kernel function in the LLVM module Mod. - /// Currently supported attributes: - /// - reqd_work_group_size - /// - work_group_size_hint - static void restoreKernelAttributes(llvm::Module *Mod, SYCLKernelInfo &Info); - /// /// Default settings for the SPIRV translation options. static SPIRV::TranslatorOpts &translatorOpts(); diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 04a09ee84a72e..004fa07145b48 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -933,6 +933,29 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( Binary.addProperty(std::move(ArgMaskPropSet)); + if (Format == ::jit_compiler::BinaryFormat::PTX) { + // Add a program metadata property with the reqd_work_group_size attribute. + // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference. + auto ReqdWGS = std::find_if( + FusedKernelInfo.Attributes.begin(), FusedKernelInfo.Attributes.end(), + [](const ::jit_compiler::SYCLKernelAttribute &Attr) { + return Attr.AttributeName == "reqd_work_group_size"; + }); + if (ReqdWGS != FusedKernelInfo.Attributes.end()) { + auto Encoded = encodeReqdWorkGroupSize(*ReqdWGS); + std::stringstream PropName; + PropName << FusedKernelInfo.Name; + PropName << __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE; + PropertyContainer ReqdWorkGroupSizeProp{ + PropName.str(), Encoded.data(), Encoded.size(), + pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY}; + PropertySetContainer ProgramMetadata{ + __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA}; + ProgramMetadata.addProperty(std::move(ReqdWorkGroupSizeProp)); + Binary.addProperty(std::move(ProgramMetadata)); + } + } + DeviceBinariesCollection Collection; Collection.addDeviceBinary( std::move(Binary), FusedKernelInfo.BinaryInfo.BinaryStart, @@ -972,6 +995,23 @@ std::vector jit_compiler::encodeArgUsageMask( return Encoded; } +std::vector jit_compiler::encodeReqdWorkGroupSize( + const ::jit_compiler::SYCLKernelAttribute &Attr) const { + assert(Attr.AttributeName == "reqd_work_group_size"); + size_t NumBytes = sizeof(uint64_t) + (Attr.Values.size() * sizeof(uint32_t)); + std::vector Encoded(NumBytes, 0u); + uint8_t *Ptr = Encoded.data(); + // Skip 64-bit wide size argument with value 0 at the start of the data. + // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference. + Ptr += sizeof(uint64_t); + for (const auto &Val : Attr.Values) { + uint32_t UVal = std::stoul(Val); + std::memcpy(Ptr, &UVal, sizeof(uint32_t)); + Ptr += sizeof(uint32_t); + } + return Encoded; +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index e02be562de3ee..4b299572bacbc 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -16,6 +16,7 @@ namespace jit_compiler { enum class BinaryFormat; class JITContext; struct SYCLKernelInfo; +struct SYCLKernelAttribute; using ArgUsageMask = std::vector; } // namespace jit_compiler @@ -47,11 +48,15 @@ class jit_compiler { jit_compiler &operator=(const jit_compiler &&) = delete; pi_device_binaries - createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, ::jit_compiler::BinaryFormat Format); + createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, + ::jit_compiler::BinaryFormat Format); std::vector encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const; + std::vector encodeReqdWorkGroupSize( + const ::jit_compiler::SYCLKernelAttribute &Attr) const; + // Manages the lifetime of the PI structs for device binaries. std::vector JITDeviceBinaries; diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp index 59530a0e691ce..d0cc9e824bc07 100644 --- a/sycl/source/detail/jit_device_binaries.cpp +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -81,7 +81,8 @@ void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) { } pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( - const unsigned char *BinaryStart, size_t BinarySize, const char* TargetSpec, pi_device_binary_type Format) { + const unsigned char *BinaryStart, size_t BinarySize, const char *TargetSpec, + pi_device_binary_type Format) { pi_device_binary_struct DeviceBinary; DeviceBinary.Version = PI_DEVICE_BINARY_VERSION; DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL; @@ -106,7 +107,8 @@ pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec, pi_device_binary_type Format) { + const char *TargetSpec, + pi_device_binary_type Format) { // Adding to the vectors might trigger reallocation, which would invalidate // the pointers used for PI structs if a PI struct has already been created // via getPIDeviceStruct(). Forbid calls to this method after the first PI diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp index 96079d3a25a19..cecab17870650 100644 --- a/sycl/source/detail/jit_device_binaries.hpp +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -111,7 +111,8 @@ class DeviceBinaryContainer { pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec, pi_device_binary_type Format); + const char *TargetSpec, + pi_device_binary_type Format); private: bool Fused = true; @@ -138,7 +139,7 @@ class DeviceBinariesCollection { void addDeviceBinary(DeviceBinaryContainer &&Cont, const unsigned char *BinaryStart, size_t BinarySize, - const char* TargetSpec, pi_device_binary_type Format); + const char *TargetSpec, pi_device_binary_type Format); pi_device_binaries getPIDeviceStruct(); private: From 1559b85b00a17ad2eadb71fe229217f704038449 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 14 Feb 2023 13:39:10 +0000 Subject: [PATCH 11/25] [SYCL][Fusion] Cache and groom input binaries Parse each input binary only once. Groom the nvvm annotations for functions deleted before fusion. Signed-off-by: Lukas Sommer --- .../jit-compiler/lib/fusion/ModuleHelper.cpp | 8 ++ .../lib/translation/KernelTranslation.cpp | 104 +++++++++--------- 2 files changed, 60 insertions(+), 52 deletions(-) diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp index c197fedf47e69..0a90600530f74 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp @@ -8,6 +8,7 @@ #include "ModuleHelper.h" +#include "target/TargetFusionInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Function.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -22,6 +23,13 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod, SmallPtrSet UnusedFunctions; identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions); + { + auto TFI = llvm::TargetFusionInfo::getTargetFusionInfo(Mod); + SmallVector Unused{UnusedFunctions.begin(), + UnusedFunctions.end()}; + TFI.notifyFunctionsDelete(Unused); + } + // Clone the module, but use an external reference in place of the global // definition for unused functions. auto FunctionCloneMask = [&](const GlobalValue *GV) -> bool { diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 1436f51f40aaa..b877829782658 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -75,68 +75,68 @@ KernelTranslator::loadKernels(llvm::LLVMContext &LLVMCtx, const unsigned char *ModulePtr = BinInfo.BinaryStart; size_t ModuleSize = BinInfo.BinarySize; BinaryBlob BinBlob{ModulePtr, ModuleSize}; - if (ParsedBinaries.contains(BinBlob)) { + if (!ParsedBinaries.contains(BinBlob)) { // Multiple kernels can be stored in the same SPIR-V or LLVM IR module. - // If we encountered the same binary module before, skip. + // We only load if we did not encounter the same binary module before. // NOTE: We compare the pointer as well as the size, in case // a previous kernel only referenced part of the SPIR-V/LLVM IR module. // Not sure this can actually happen, but better safe than sorry. - continue; - } - // Simply load and translate the SPIR-V into the currently still empty - // module. - std::unique_ptr NewMod; + // Simply load and translate the SPIR-V into the currently still empty + // module. + std::unique_ptr NewMod; - switch (BinInfo.Format) { - case BinaryFormat::LLVM: { - auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel); - if (auto Err = ModOrError.takeError()) { - return std::move(Err); + switch (BinInfo.Format) { + case BinaryFormat::LLVM: { + auto ModOrError = loadLLVMKernel(LLVMCtx, Kernel); + if (auto Err = ModOrError.takeError()) { + return std::move(Err); + } + NewMod = std::move(*ModOrError); + break; } - NewMod = std::move(*ModOrError); - break; - } - case BinaryFormat::SPIRV: { - auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel); - if (auto Err = ModOrError.takeError()) { - return std::move(Err); + case BinaryFormat::SPIRV: { + auto ModOrError = loadSPIRVKernel(LLVMCtx, Kernel); + if (auto Err = ModOrError.takeError()) { + return std::move(Err); + } + NewMod = std::move(*ModOrError); + break; } - NewMod = std::move(*ModOrError); - break; - } - default: { - return createStringError( - inconvertibleErrorCode(), - "Failed to load kernel from unsupported input format"); - } - } - - // We do not assume that the input binary information has the address bits - // set, but rather retrieve this information from the SPIR-V/LLVM module's - // data-layout. - BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits(); - - if (First) { - // We can simply assign the module we just loaded from SPIR-V to the - // empty pointer on the first iteration. - Result = std::move(NewMod); - // The first module will dictate the address bits for the remaining. - AddressBits = BinInfo.AddressBits; - First = false; - } else { - // We have already loaded some module, so now we need to - // link the module we just loaded with the result so far. - // FIXME: We allow duplicates to be overridden by the module - // read last. This could cause problems if different modules contain - // definitions with the same name, but different body/content. - // Check that this is not problematic. - Linker::linkModules(*Result, std::move(NewMod), - Linker::Flags::OverrideFromSrc); - if (AddressBits != BinInfo.AddressBits) { + default: { return createStringError( inconvertibleErrorCode(), - "Number of address bits between SPIR-V modules does not match"); + "Failed to load kernel from unsupported input format"); + } + } + + // We do not assume that the input binary information has the address bits + // set, but rather retrieve this information from the SPIR-V/LLVM module's + // data-layout. + BinInfo.AddressBits = NewMod->getDataLayout().getPointerSizeInBits(); + + if (First) { + // We can simply assign the module we just loaded from SPIR-V to the + // empty pointer on the first iteration. + Result = std::move(NewMod); + // The first module will dictate the address bits for the remaining. + AddressBits = BinInfo.AddressBits; + First = false; + } else { + // We have already loaded some module, so now we need to + // link the module we just loaded with the result so far. + // FIXME: We allow duplicates to be overridden by the module + // read last. This could cause problems if different modules contain + // definitions with the same name, but different body/content. + // Check that this is not problematic. + Linker::linkModules(*Result, std::move(NewMod), + Linker::Flags::OverrideFromSrc); + if (AddressBits != BinInfo.AddressBits) { + return createStringError( + inconvertibleErrorCode(), + "Number of address bits between SPIR-V modules does not match"); + } } + ParsedBinaries.insert(BinBlob); } // Restore SYCL/OpenCL kernel attributes such as 'reqd_work_group_size' or // 'work_group_size_hint' from metadata attached to the kernel function and From fd341247108ad692526d4b22719b0b2d66350262 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 14 Feb 2023 15:44:25 +0000 Subject: [PATCH 12/25] [SYCL][Fusion] Disable heterogeneous ND ranges on CUDA Signed-off-by: Lukas Sommer --- sycl-fusion/jit-compiler/CMakeLists.txt | 1 + sycl-fusion/jit-compiler/include/JITContext.h | 1 - sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 15 ++++++++++----- sycl-fusion/passes/CMakeLists.txt | 1 + sycl-fusion/passes/kernel-fusion/Builtins.cpp | 12 ------------ sycl-fusion/passes/kernel-fusion/Builtins.h | 4 ---- sycl/source/detail/jit_compiler.hpp | 2 +- 7 files changed, 13 insertions(+), 23 deletions(-) diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index f67eed5fc6c76..777f06b303e0c 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_library(sycl-fusion ScalarOpts InstCombine Target + TargetParser NVPTX X86 MC diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index c26312ceb70e6..eda8c83d8b21c 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -88,7 +88,6 @@ class JITContext { mutable MutexT CacheMutex; std::unordered_map Cache; - }; } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 821ad4cd36369..e2f321bbcb81b 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -56,6 +56,10 @@ FusionResult KernelFusion::fuseKernels( int BarriersFlags, const std::vector &Internalization, const std::vector &Constants) { + // Initialize the configuration helper to make the options for this invocation + // available (on a per-thread basis). + ConfigHelper::setConfig(std::move(JITConfig)); + const auto NDRanges = gatherNDRanges(KernelInformation); if (!isValidCombination(NDRanges)) { @@ -64,9 +68,12 @@ FusionResult KernelFusion::fuseKernels( "different global sizes in dimensions [2, N) and non-zero offsets"}; } - // Initialize the configuration helper to make the options for this invocation - // available (on a per-thread basis). - ConfigHelper::setConfig(std::move(JITConfig)); + bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges); + + BinaryFormat TargetFormat = ConfigHelper::get(); + if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) { + return FusionResult{"Heterogeneous ND ranges not supported for CUDA"}; + } bool CachingEnabled = ConfigHelper::get(); CacheKeyT CacheKey{KernelsToFuse, @@ -137,8 +144,6 @@ FusionResult KernelFusion::fuseKernels( SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName); - BinaryFormat TargetFormat = ConfigHelper::get(); - if (auto Error = translation::KernelTranslator::translateKernel( FusedKernelInfo, *NewMod, JITCtx, TargetFormat)) { return errorToFusionResult(std::move(Error), diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index c74c3c40e34e0..95f504091dc30 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -46,6 +46,7 @@ add_llvm_library(SYCLKernelFusionPasses Support TransformUtils Passes + TargetParser ) target_include_directories(SYCLKernelFusionPasses diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.cpp b/sycl-fusion/passes/kernel-fusion/Builtins.cpp index 6de1c40e8f4cd..9cff120cd5888 100644 --- a/sycl-fusion/passes/kernel-fusion/Builtins.cpp +++ b/sycl-fusion/passes/kernel-fusion/Builtins.cpp @@ -595,18 +595,6 @@ jit_compiler::Remapper::remapBuiltins(Function *F, const NDRange &SrcNDRange, return Clone; } -void jit_compiler::barrierCall(IRBuilderBase &Builder, int Flags) { - assert((Flags == 1 || Flags == 2 || Flags == 3) && "Invalid barrier flags"); - - // See - // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- - createSPIRVCall(Builder, BarrierName, - {Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(0x10 | (Flags % 2 == 1 ? 0x100 : 0x0) | - ((Flags >> 1 == 1 ? 0x200 : 0x0)))}); -} - Value *jit_compiler::createSPIRVCall(IRBuilderBase &Builder, StringRef FunctionName, ArrayRef Args) { diff --git a/sycl-fusion/passes/kernel-fusion/Builtins.h b/sycl-fusion/passes/kernel-fusion/Builtins.h index bc8e186f14188..a55b6efab39f9 100644 --- a/sycl-fusion/passes/kernel-fusion/Builtins.h +++ b/sycl-fusion/passes/kernel-fusion/Builtins.h @@ -60,10 +60,6 @@ constexpr llvm::StringLiteral OffloadStartWrapperName{ llvm::Value *getGlobalLinearID(llvm::IRBuilderBase &Builder, const NDRange &FusedNDRange); -/// -/// Creates a call to a barrier function. -void barrierCall(llvm::IRBuilderBase &Builder, int Flags); - /// /// @return A call to a SPIRV function, which will be declared if not already in /// the module. diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 4b299572bacbc..71a57723ea6bb 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -13,7 +13,7 @@ #include namespace jit_compiler { -enum class BinaryFormat; +enum class BinaryFormat : uint32_t; class JITContext; struct SYCLKernelInfo; struct SYCLKernelAttribute; From fc5efbcf35937b83e3dceb31c48230463431e163 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 14 Feb 2023 16:14:50 +0000 Subject: [PATCH 13/25] [SYCL][Fusion] Enable JIT caching for CUDA fusion Signed-off-by: Lukas Sommer --- sycl/source/detail/jit_compiler.cpp | 11 ++++++++--- sycl/source/detail/jit_compiler.hpp | 4 ++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 004fa07145b48..a7de0dd12f1a4 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -859,9 +859,14 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo, TargetFormat); detail::ProgramManager::getInstance().addImages(PIDeviceBinaries); Handle = OSUtil::getOSModuleHandle(PIDeviceBinaries->DeviceBinaries); - } else if (DebugEnabled) { - // TODO(Lukas): Create correct OSModuleHandle when using a cached binary. - std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; + CachedModules.emplace(FusedKernelInfo.Name, Handle); + } else { + if (DebugEnabled) { + std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; + } + // Retrieve an OSModuleHandle for the cached binary. + assert(CachedModules.count(FusedKernelInfo.Name) && "No cached binary"); + Handle = CachedModules.at(FusedKernelInfo.Name); } // Create a kernel bundle for the fused kernel. diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 71a57723ea6bb..fae774cadd09a 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -12,6 +12,8 @@ #include #include +#include + namespace jit_compiler { enum class BinaryFormat : uint32_t; class JITContext; @@ -60,6 +62,8 @@ class jit_compiler { // Manages the lifetime of the PI structs for device binaries. std::vector JITDeviceBinaries; + std::unordered_map CachedModules; + std::unique_ptr<::jit_compiler::JITContext> MJITContext; }; From 6c14311e00f3a294ca18dbf846c7cc76683ba7ff Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 15 Feb 2023 12:56:58 +0000 Subject: [PATCH 14/25] [SYCL][Fusion] Catch empty standard arguments Signed-off-by: Lukas Sommer --- sycl/source/detail/jit_compiler.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index a7de0dd12f1a4..ffcdca25584f7 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -457,7 +457,10 @@ static ParamIterator preProcessArguments( // which will go out-of-scope before we execute the fused kernel. Therefore, // we need to copy the argument to a permant location and update the // argument. - Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize); + if (Arg->Arg.MPtr) { + Arg->Arg.MPtr = + storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize); + } // Standard layout arguments do not participate in identical argument // detection, but we still add it to the list here. As the SYCL runtime can // only check the raw bytes for identical content, but is unaware of the @@ -474,6 +477,7 @@ static ParamIterator preProcessArguments( ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex}, Arg->Arg.MPtr, Arg->Arg.MSize); return ++Arg; + } // First check if there's already another parameter with identical // value. From 980d36d539f70ac10cb9511180a8f2e332d4a48c Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 7 Mar 2023 14:03:28 +0000 Subject: [PATCH 15/25] [SYCL][Fusion] Rebase and address feedback Signed-off-by: Lukas Sommer --- clang/include/clang/Driver/Action.h | 2 +- clang/lib/Driver/Action.cpp | 2 +- sycl-fusion/jit-compiler/CMakeLists.txt | 7 +- sycl-fusion/jit-compiler/include/JITContext.h | 7 +- sycl-fusion/jit-compiler/include/Options.h | 1 + sycl-fusion/jit-compiler/lib/JITContext.cpp | 11 +- sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 22 + .../lib/fusion/FusionPipeline.cpp | 4 +- .../jit-compiler/lib/fusion/ModuleHelper.cpp | 2 +- .../lib/translation/KernelTranslation.cpp | 43 +- .../lib/translation/KernelTranslation.h | 10 +- .../lib/translation/SPIRVLLVMTranslation.cpp | 2 +- .../internalization/Internalization.cpp | 2 +- .../passes/kernel-fusion/SYCLKernelFusion.cpp | 13 +- sycl-fusion/passes/syclcp/SYCLCP.cpp | 2 +- .../passes/target/TargetFusionInfo.cpp | 458 +++++++++++------- sycl-fusion/passes/target/TargetFusionInfo.h | 131 +---- sycl/source/detail/device_info.hpp | 19 +- sycl/source/detail/jit_compiler.cpp | 38 +- sycl/source/detail/scheduler/commands.hpp | 2 +- 20 files changed, 416 insertions(+), 362 deletions(-) diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index 9dab2e32b2ffc..45e9133b7ed13 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -665,7 +665,7 @@ class OffloadWrapperJobAction : public JobAction { public: OffloadWrapperJobAction(ActionList &Inputs, types::ID Type); OffloadWrapperJobAction(Action *Input, types::ID OutputType, - bool IsEmbeddedIR = false); + bool EmbedIR = false); bool isEmbeddedIR() const { return EmbedIR; } diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp index f55233c59a6db..4cb0225cad293 100644 --- a/clang/lib/Driver/Action.cpp +++ b/clang/lib/Driver/Action.cpp @@ -478,7 +478,7 @@ void OffloadWrapperJobAction::anchor() {} OffloadWrapperJobAction::OffloadWrapperJobAction(ActionList &Inputs, types::ID Type) - : JobAction(OffloadWrapperJobClass, Inputs, Type) {} + : JobAction(OffloadWrapperJobClass, Inputs, Type), EmbedIR(false) {} OffloadWrapperJobAction::OffloadWrapperJobAction(Action *Input, types::ID Type, bool IsEmbeddedIR) diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index 777f06b303e0c..bf323239679b4 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -22,9 +22,8 @@ add_llvm_library(sycl-fusion InstCombine Target TargetParser - NVPTX - X86 MC + ${LLVM_TARGETS_TO_BUILD} ) target_include_directories(sycl-fusion @@ -47,6 +46,10 @@ target_link_libraries(sycl-fusion ${CMAKE_THREAD_LIBS_INIT} ) +if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX) +endif() + if (BUILD_SHARED_LIBS) if(NOT MSVC AND NOT APPLE) # Manage symbol visibility through the linker to make sure no LLVM symbols diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index eda8c83d8b21c..69465a74d8371 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -39,7 +39,7 @@ using CacheKeyT = /// Wrapper around a kernel binary. class KernelBinary { public: - explicit KernelBinary(std::string Binary, BinaryFormat Format); + explicit KernelBinary(std::string &&Binary, BinaryFormat Format); jit_compiler::BinaryAddress address() const; @@ -65,7 +65,10 @@ class JITContext { llvm::LLVMContext *getLLVMContext(); - KernelBinary &emplaceSPIRVBinary(std::string Binary, BinaryFormat Format); + template KernelBinary &emplaceKernelBinary(Ts &&...Args) { + WriteLockT WriteLock{BinariesMutex}; + return Binaries.emplace_back(std::forward(Args)...); + } std::optional getCacheEntry(CacheKeyT &Identifier) const; diff --git a/sycl-fusion/jit-compiler/include/Options.h b/sycl-fusion/jit-compiler/include/Options.h index 4fe7787df00db..841a229adb7a3 100644 --- a/sycl-fusion/jit-compiler/include/Options.h +++ b/sycl-fusion/jit-compiler/include/Options.h @@ -10,6 +10,7 @@ #define SYCL_FUSION_JIT_COMPILER_OPTIONS_H #include "Kernel.h" + #include #include diff --git a/sycl-fusion/jit-compiler/lib/JITContext.cpp b/sycl-fusion/jit-compiler/lib/JITContext.cpp index e1dda8b928c45..4499dd93f4d76 100644 --- a/sycl-fusion/jit-compiler/lib/JITContext.cpp +++ b/sycl-fusion/jit-compiler/lib/JITContext.cpp @@ -11,7 +11,7 @@ using namespace jit_compiler; -KernelBinary::KernelBinary(std::string Binary, BinaryFormat Fmt) +KernelBinary::KernelBinary(std::string &&Binary, BinaryFormat Fmt) : Blob{std::move(Binary)}, Format{Fmt} {} jit_compiler::BinaryAddress KernelBinary::address() const { @@ -29,15 +29,6 @@ JITContext::~JITContext() = default; llvm::LLVMContext *JITContext::getLLVMContext() { return LLVMCtx.get(); } -KernelBinary &JITContext::emplaceSPIRVBinary(std::string Binary, - BinaryFormat Format) { - WriteLockT WriteLock{BinariesMutex}; - // NOTE: With C++17, which returns a reference from emplace_back, the - // following code would be even simpler. - Binaries.emplace_back(std::move(Binary), Format); - return Binaries.back(); -} - std::optional JITContext::getCacheEntry(CacheKeyT &Identifier) const { ReadLockT ReadLock{CacheMutex}; diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index e2f321bbcb81b..be7515d935247 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -48,6 +48,22 @@ gatherNDRanges(llvm::ArrayRef KernelInformation) { return NDRanges; } +static bool isTargetFormatSupported(BinaryFormat TargetFormat) { + switch (TargetFormat) { + case BinaryFormat::SPIRV: + return true; + case BinaryFormat::PTX: { +#ifdef FUSION_JIT_SUPPORT_PTX + return true; +#else // FUSION_JIT_SUPPORT_PTX + return false; +#endif // FUSION_JIT_SUPPORT_PTX + } + default: + return false; + } +} + FusionResult KernelFusion::fuseKernels( JITContext &JITCtx, Config &&JITConfig, const std::vector &KernelInformation, @@ -71,6 +87,12 @@ FusionResult KernelFusion::fuseKernels( bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges); BinaryFormat TargetFormat = ConfigHelper::get(); + + if (!isTargetFormatSupported(TargetFormat)) { + return FusionResult( + "Fusion output target format not supported by this build"); + } + if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) { return FusionResult{"Heterogeneous ND ranges not supported for CUDA"}; } diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index fb2e2b15200a7..965202fe5bbfa 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -23,8 +23,8 @@ #ifndef NDEBUG #include "llvm/IR/Verifier.h" #endif // NDEBUG -#include "llvm/ADT/Triple.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" @@ -103,7 +103,7 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, // to/from generic address-space as possible, because these hinder // internalization. // Ideally, the static compiler should have performed that job. - unsigned FlatAddressSpace = getFlatAddressSpace(Mod); + const unsigned FlatAddressSpace = getFlatAddressSpace(Mod); FPM.addPass(InferAddressSpacesPass(FlatAddressSpace)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } diff --git a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp index 0a90600530f74..0d6ac7f48fbbe 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/ModuleHelper.cpp @@ -24,7 +24,7 @@ helper::ModuleHelper::cloneAndPruneModule(Module *Mod, identifyUnusedFunctions(Mod, CGRoots, UnusedFunctions); { - auto TFI = llvm::TargetFusionInfo::getTargetFusionInfo(Mod); + TargetFusionInfo TFI{Mod}; SmallVector Unused{UnusedFunctions.begin(), UnusedFunctions.end()}; TFI.notifyFunctionsDelete(Unused); diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index b877829782658..8b1f3d913026d 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "KernelTranslation.h" + #include "SPIRVLLVMTranslation.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/Constants.h" @@ -182,7 +183,8 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, break; } case BinaryFormat::PTX: { - llvm::Expected BinaryOrError = translateToPTX(Mod, JITCtx); + llvm::Expected BinaryOrError = + translateToPTX(Kernel, Mod, JITCtx); if (auto Error = BinaryOrError.takeError()) { return Error; } @@ -215,12 +217,20 @@ KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) { } llvm::Expected -KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { - // FIXME: Can we limit this to the NVPTX specific target? - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllAsmPrinters(); - llvm::InitializeAllTargetMCs(); +KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, + JITContext &JITCtx) { +#ifndef FUSION_JIT_SUPPORT_PTX + return createStringError(inconvertibleErrorCode(), + "PTX translation not supported in this build"); +#else // FUSION_JIT_SUPPORT_PTX + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXAsmPrinter(); + LLVMInitializeNVPTXTargetMC(); +#endif // FUSION_JIT_SUPPORT_PTX + + static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; + static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; std::string TargetTriple{"nvptx64-nvidia-cuda"}; @@ -231,13 +241,26 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { if (!Target) { return createStringError( inconvertibleErrorCode(), - "Failed to load and translate SPIR-V module with error %s", + "Failed to load and translate PTX LLVM IR module with error %s", ErrorMessage.c_str()); } + llvm::StringRef TargetCPU{"sm_50"}; + llvm::StringRef TargetFeatures{"+sm_50,+ptx76"}; + if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) { + if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + TargetCPU = + KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + } + if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); + } + } + // FIXME: Check whether we can provide more accurate target information here auto *TargetMachine = Target->createTargetMachine( - TargetTriple, "sm_50", "+sm_50,+ptx76", {}, llvm::Reloc::PIC_, + TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_, std::nullopt, llvm::CodeGenOpt::Default); llvm::legacy::PassManager PM; @@ -259,5 +282,5 @@ KernelTranslator::translateToPTX(llvm::Module &Mod, JITContext &JITCtx) { ASMStream.flush(); } - return &JITCtx.emplaceSPIRVBinary(PTXASM, BinaryFormat::PTX); + return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX); } diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 276b6e536d269..7e4816df9bf94 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#ifndef SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H +#define SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H #include "JITContext.h" #include "Kernel.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include +#include "llvm/Support/Error.h" #include namespace jit_compiler { @@ -39,8 +41,10 @@ class KernelTranslator { static llvm::Expected translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx); - static llvm::Expected translateToPTX(llvm::Module &Mod, - JITContext &JITCtx); + static llvm::Expected + translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx); }; } // namespace translation } // namespace jit_compiler + +#endif // SYCL_FUSION_JIT_COMPILER_TRANSLATION_KERNELTRANSLATION_H diff --git a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp index a5effb33a85e1..4092f9dd96fc8 100644 --- a/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/SPIRVLLVMTranslation.cpp @@ -94,5 +94,5 @@ SPIRVLLVMTranslator::translateLLVMtoSPIRV(Module &Mod, JITContext &JITCtx) { "Translation of LLVM IR to SPIR-V failed with error %s", ErrMsg.c_str()); } - return &JITCtx.emplaceSPIRVBinary(BinaryStream.str(), BinaryFormat::SPIRV); + return &JITCtx.emplaceKernelBinary(BinaryStream.str(), BinaryFormat::SPIRV); } diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index bca46ba43acb3..3d5c38c799e8f 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -631,7 +631,7 @@ static void moduleCleanup(Module &M, ModuleAnalysisManager &AM, PreservedAnalyses llvm::SYCLInternalizer::run(Module &M, ModuleAnalysisManager &AM) { - auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + TargetFusionInfo TFI{&M}; // Private promotion const PreservedAnalyses Tmp = SYCLInternalizerImpl{ TFI.getPrivateAddressSpace(), PrivatePromotion, true, TFI}(M, AM); diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp index a0079affbaa06..b61a0936eb32e 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp @@ -138,7 +138,7 @@ PreservedAnalyses SYCLKernelFusion::run(Module &M, ModuleAnalysisManager &AM) { AM.getResult(M).ModuleInfo; assert(ModuleInfo && "No module information available"); - auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + TargetFusionInfo TFI{&M}; // Iterate over the functions in the module and locate all // stub functions identified by metadata. @@ -456,11 +456,20 @@ Error SYCLKernelFusion::fuseKernel( FT, GlobalValue::LinkageTypes::ExternalLinkage, M.getDataLayout().getProgramAddressSpace(), KernelName->getString(), &M); { + auto DefaultAttr = FusedFunction->getAttributes(); + // Add uniform function attributes, i.e., attributes with identical value on + // each input function, to the fused function. + auto *FirstFunction = InputFunctions.front().F; + for (const auto &UniformKey : TargetInfo.getUniformKernelAttributes()) { + if (FirstFunction->hasFnAttribute(UniformKey)) { + DefaultAttr = DefaultAttr.addFnAttribute( + LLVMCtx, FirstFunction->getFnAttribute(UniformKey)); + } + } // Add the collected parameter attributes to the fused function. // Copying the parameter attributes from their original definition in the // input kernels should be safe and they most likely can't be deducted later // on, as no caller is present in the module. - auto DefaultAttr = FusedFunction->getAttributes(); auto FusedFnAttrs = AttributeList::get(LLVMCtx, DefaultAttr.getFnAttrs(), DefaultAttr.getRetAttrs(), FusedParamAttributes); diff --git a/sycl-fusion/passes/syclcp/SYCLCP.cpp b/sycl-fusion/passes/syclcp/SYCLCP.cpp index 5cb9b00433ef7..b928a33760da0 100644 --- a/sycl-fusion/passes/syclcp/SYCLCP.cpp +++ b/sycl-fusion/passes/syclcp/SYCLCP.cpp @@ -249,7 +249,7 @@ PreservedAnalyses SYCLCP::run(Module &M, ModuleAnalysisManager &AM) { Changed = propagateConstants(F, *ConstantsOrErr) || Changed; } - auto TFI = TargetFusionInfo::getTargetFusionInfo(&M); + TargetFusionInfo TFI{&M}; if (Changed) { moduleCleanup(M, AM, TFI); diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index c17e2464ac323..27514bbb80269 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -7,234 +7,336 @@ //===----------------------------------------------------------------------===// #include "TargetFusionInfo.h" -#include "llvm/ADT/Triple.h" + #include "llvm/IR/Constants.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/TargetParser/Triple.h" -using namespace llvm; +namespace llvm { +class TargetFusionInfoImpl { -// -// TargetFusionInfo -// +public: + explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {}; -TargetFusionInfo TargetFusionInfo::getTargetFusionInfo(llvm::Module *Mod) { - llvm::Triple Tri(Mod->getTargetTriple()); - if (Tri.isNVPTX()) { - return TargetFusionInfo( - std::shared_ptr(new NVPTXTargetFusionInfo(Mod))); - } - if (Tri.isSPIRV() || Tri.isSPIR()) { - return TargetFusionInfo( - std::shared_ptr(new SPIRVTargetFusionInfo(Mod))); - } - assert(false && "Unsupported target for fusion"); -} + virtual ~TargetFusionInfoImpl() = default; + + virtual void notifyFunctionsDelete( + [[maybe_unused]] llvm::ArrayRef Funcs) const {} + + virtual void addKernelFunction([[maybe_unused]] Function *KernelFunc) const {} + + virtual void postProcessKernel([[maybe_unused]] Function *KernelFunc) const {} + + virtual ArrayRef getKernelMetadataKeys() const { return {}; } + + virtual ArrayRef getUniformKernelAttributes() const { return {}; } + + virtual void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const = 0; + + virtual unsigned getPrivateAddressSpace() const = 0; + + virtual unsigned getLocalAddressSpace() const = 0; + + virtual void + updateAddressSpaceMetadata([[maybe_unused]] Function *KernelFunc, + [[maybe_unused]] ArrayRef LocalSize, + [[maybe_unused]] unsigned AddressSpace) const {} + +protected: + llvm::Module *LLVMMod; +}; // // SPIRVTargetFusionInfo // +class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { +public: + using TargetFusionInfoImpl::TargetFusionInfoImpl; -void SPIRVTargetFusionInfo::addKernelFunction(Function *KernelFunc) const { - KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL); -} - -ArrayRef SPIRVTargetFusionInfo::getKernelMetadataKeys() const { - // NOTE: We do not collect the "kernel_arg_name" metadata, because - // the kernel arguments receive new names in the fused kernel. - static SmallVector Keys{ - {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", - "kernel_arg_base_type", "kernel_arg_type_qual"}}; - return Keys; -} + void addKernelFunction(Function *KernelFunc) const override { + KernelFunc->setCallingConv(CallingConv::SPIR_KERNEL); + } -void SPIRVTargetFusionInfo::postProcessKernel(Function *KernelFunc) const { - // Attach the kernel_arg_name metadata. - SmallVector KernelArgNames; - for (auto &P : KernelFunc->args()) { - KernelArgNames.push_back(MDString::get(LLVMMod->getContext(), P.getName())); + ArrayRef getKernelMetadataKeys() const override { + // NOTE: We do not collect the "kernel_arg_name" metadata, because + // the kernel arguments receive new names in the fused kernel. + static SmallVector Keys{ + {"kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", + "kernel_arg_base_type", "kernel_arg_type_qual"}}; + return Keys; } - auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames); - KernelFunc->setMetadata("kernel_arg_name", ArgNameMD); - - static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; - static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; - // Remove all existing calls of the ITT instrumentation functions. Insert new - // ones in the entry block of the fused kernel and every exit block if the - // functions are present in the module. - // We cannot use the existing SPIRITTAnnotations pass, because that pass might - // insert calls to functions not present in the module (e.g., ITT - // instrumentations for barriers). As the JITed module is not linked with - // libdevice anymore, the functions would remain unresolved and cause the - // driver to fail. - Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper); - Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper); - bool InsertWrappers = - ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) && - (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration())); - auto *WrapperFuncTy = FunctionType::get( - Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false); - for (auto &BB : *KernelFunc) { - for (auto Inst = BB.begin(); Inst != BB.end();) { - if (auto *CB = dyn_cast(Inst)) { - if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) { - Inst = Inst->eraseFromParent(); - continue; + + void postProcessKernel(Function *KernelFunc) const override { + // Attach the kernel_arg_name metadata. + SmallVector KernelArgNames; + for (auto &P : KernelFunc->args()) { + KernelArgNames.push_back( + MDString::get(LLVMMod->getContext(), P.getName())); + } + auto *ArgNameMD = MDTuple::get(LLVMMod->getContext(), KernelArgNames); + KernelFunc->setMetadata("kernel_arg_name", ArgNameMD); + + static constexpr auto ITTStartWrapper = "__itt_offload_wi_start_wrapper"; + static constexpr auto ITTFinishWrapper = "__itt_offload_wi_finish_wrapper"; + // Remove all existing calls of the ITT instrumentation functions. Insert + // new ones in the entry block of the fused kernel and every exit block if + // the functions are present in the module. We cannot use the existing + // SPIRITTAnnotations pass, because that pass might insert calls to + // functions not present in the module (e.g., ITT instrumentations for + // barriers). As the JITed module is not linked with libdevice anymore, the + // functions would remain unresolved and cause the driver to fail. + Function *StartWrapperFunc = LLVMMod->getFunction(ITTStartWrapper); + Function *FinishWrapperFunc = LLVMMod->getFunction(ITTFinishWrapper); + bool InsertWrappers = + ((StartWrapperFunc && !StartWrapperFunc->isDeclaration()) && + (FinishWrapperFunc && !FinishWrapperFunc->isDeclaration())); + auto *WrapperFuncTy = FunctionType::get( + Type::getVoidTy(LLVMMod->getContext()), /*isVarArg*/ false); + for (auto &BB : *KernelFunc) { + for (auto Inst = BB.begin(); Inst != BB.end();) { + if (auto *CB = dyn_cast(Inst)) { + if (CB->getCalledFunction()->getName().starts_with("__itt_offload")) { + Inst = Inst->eraseFromParent(); + continue; + } + } + ++Inst; + } + if (InsertWrappers) { + if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { + auto *WrapperCall = + CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI); + WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); } } - ++Inst; } if (InsertWrappers) { - if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { - auto *WrapperCall = - CallInst::Create(WrapperFuncTy, FinishWrapperFunc, "", RI); - WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); - } + KernelFunc->getEntryBlock().getFirstInsertionPt(); + auto *WrapperCall = + CallInst::Create(WrapperFuncTy, StartWrapperFunc, "", + &*KernelFunc->getEntryBlock().getFirstInsertionPt()); + WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); } } - if (InsertWrappers) { - KernelFunc->getEntryBlock().getFirstInsertionPt(); - auto *WrapperCall = - CallInst::Create(WrapperFuncTy, StartWrapperFunc, "", - &*KernelFunc->getEntryBlock().getFirstInsertionPt()); - WrapperCall->setCallingConv(CallingConv::SPIR_FUNC); - } -} -void SPIRVTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const { - if (BarrierFlags == -1) { - return; - } - assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) && - "Invalid barrier flags"); + void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const override { + if (BarrierFlags == -1) { + return; + } + assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) && + "Invalid barrier flags"); + + static const auto FnAttrs = AttributeSet::get( + LLVMMod->getContext(), + {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent), + Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)}); - static const auto FnAttrs = AttributeSet::get( - LLVMMod->getContext(), - {Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::Convergent), - Attribute::get(LLVMMod->getContext(), Attribute::AttrKind::NoUnwind)}); + static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"}; - static constexpr StringLiteral N{"_Z22__spirv_ControlBarrierjjj"}; + Function *F = LLVMMod->getFunction(N); + if (!F) { + constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage; - Function *F = LLVMMod->getFunction(N); - if (!F) { - constexpr auto Linkage = GlobalValue::LinkageTypes::ExternalLinkage; + auto *Ty = FunctionType::get( + Builder.getVoidTy(), + {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()}, + false /* isVarArg*/); - auto *Ty = FunctionType::get( - Builder.getVoidTy(), - {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()}, - false /* isVarArg*/); + F = Function::Create(Ty, Linkage, N, *LLVMMod); - F = Function::Create(Ty, Linkage, N, *LLVMMod); + F->setAttributes( + AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); + F->setCallingConv(CallingConv::SPIR_FUNC); + } + + // See + // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- + SmallVector Args{ + Builder.getInt32(/*Exec Scope : Workgroup = */ 2), + Builder.getInt32(/*Exec Scope : Workgroup = */ 2), + Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) | + ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))}; - F->setAttributes( + auto *BarrierCallInst = Builder.CreateCall(F, Args); + BarrierCallInst->setAttributes( AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); - F->setCallingConv(CallingConv::SPIR_FUNC); + BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC); } - // See - // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id- - SmallVector Args{ - Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) | - ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))}; - - auto *BarrierCallInst = Builder.CreateCall(F, Args); - BarrierCallInst->setAttributes( - AttributeList::get(LLVMMod->getContext(), FnAttrs, {}, {})); - BarrierCallInst->setCallingConv(CallingConv::SPIR_FUNC); -} + // Corresponds to definition of spir_private and spir_local in + // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap". + unsigned getPrivateAddressSpace() const override { return 0; } + unsigned getLocalAddressSpace() const override { return 3; } -void SPIRVTargetFusionInfo::updateAddressSpaceMetadata( - Function *KernelFunc, ArrayRef LocalSize, - unsigned AddressSpace) const { - static constexpr unsigned AddressSpaceBitWidth{32}; - static constexpr StringLiteral KernelArgAddrSpaceMD{"kernel_arg_addr_space"}; - - auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get( - IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth), - AddressSpace)); - if (auto *AddrspaceMD = dyn_cast_or_null( - KernelFunc->getMetadata(KernelArgAddrSpaceMD))) { - // If we have kernel_arg_addr_space metadata in the original function, - // we should update it in the new one. - SmallVector NewInfo{AddrspaceMD->op_begin(), - AddrspaceMD->op_end()}; - for (auto I : enumerate(LocalSize)) { - if (I.value() == 0) { - continue; - } - const auto Index = I.index(); - if (const auto *PtrTy = - dyn_cast(KernelFunc->getArg(Index)->getType())) { - if (PtrTy->getAddressSpace() == getLocalAddressSpace()) { - NewInfo[Index] = NewAddrspace; + void updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const override { + static constexpr unsigned AddressSpaceBitWidth{32}; + static constexpr StringLiteral KernelArgAddrSpaceMD{ + "kernel_arg_addr_space"}; + + auto *NewAddrspace = ConstantAsMetadata::get(ConstantInt::get( + IntegerType::get(LLVMMod->getContext(), AddressSpaceBitWidth), + AddressSpace)); + if (auto *AddrspaceMD = dyn_cast_or_null( + KernelFunc->getMetadata(KernelArgAddrSpaceMD))) { + // If we have kernel_arg_addr_space metadata in the original function, + // we should update it in the new one. + SmallVector NewInfo{AddrspaceMD->op_begin(), + AddrspaceMD->op_end()}; + for (auto I : enumerate(LocalSize)) { + if (I.value() == 0) { + continue; + } + const auto Index = I.index(); + if (const auto *PtrTy = + dyn_cast(KernelFunc->getArg(Index)->getType())) { + if (PtrTy->getAddressSpace() == getLocalAddressSpace()) { + NewInfo[Index] = NewAddrspace; + } } } + KernelFunc->setMetadata(KernelArgAddrSpaceMD, + MDNode::get(KernelFunc->getContext(), NewInfo)); } - KernelFunc->setMetadata(KernelArgAddrSpaceMD, - MDNode::get(KernelFunc->getContext(), NewInfo)); } -} +}; // // NVPTXTargetFusionInfo // +class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { +public: + using TargetFusionInfoImpl::TargetFusionInfoImpl; -void NVPTXTargetFusionInfo::notifyFunctionsDelete( - llvm::ArrayRef Funcs) const { - SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; - SmallVector ValidKernels; - auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); - for (auto *Op : OldAnnotations->operands()) { - if (auto *TOp = dyn_cast(Op)) { - if (auto *COp = dyn_cast_if_present( - TOp->getOperand(0).get())) { - if (!DeletedFuncs.contains(COp->getValue())) { - ValidKernels.push_back(Op); - // Add to the set to also remove duplicate entries. - DeletedFuncs.insert(COp->getValue()); + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { + SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; + SmallVector ValidKernels; + auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); + for (auto *Op : OldAnnotations->operands()) { + if (auto *TOp = dyn_cast(Op)) { + if (auto *COp = dyn_cast_if_present( + TOp->getOperand(0).get())) { + if (!DeletedFuncs.contains(COp->getValue())) { + ValidKernels.push_back(Op); + // Add to the set to also remove duplicate entries. + DeletedFuncs.insert(COp->getValue()); + } } } } + LLVMMod->eraseNamedMetadata(OldAnnotations); + auto *NewAnnotations = + LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + for (auto *Kernel : ValidKernels) { + NewAnnotations->addOperand(Kernel); + } + } + + void addKernelFunction(Function *KernelFunc) const override { + auto *NVVMAnnotations = + LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + auto *MDOne = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); + auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel"); + auto *MDFunc = ConstantAsMetadata::get(KernelFunc); + SmallVector KernelMD({MDFunc, MDKernelString, MDOne}); + auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD); + NVVMAnnotations->addOperand(Tuple); + } + + ArrayRef getKernelMetadataKeys() const override { + // FIXME: Check whether we need to take care of sycl_fixed_targets. + static SmallVector Keys{{"kernel_arg_buffer_location", + "kernel_arg_runtime_aligned", + "kernel_arg_exclusive_ptr"}}; + return Keys; + } + + ArrayRef getUniformKernelAttributes() const override { + static SmallVector Keys{ + {"target-cpu", "target-features", "uniform-work-group-size"}}; + return Keys; + } + + void createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const override { + if (BarrierFlags == -1) { + return; + } + // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX + // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0 + // instruction, equivalent to the __syncthreads() call in CUDA." + Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {}); + } + + // Corresponds to the definitions in the LLVM NVPTX backend user guide: + // https://llvm.org/docs/NVPTXUsage.html#address-spaces + unsigned getPrivateAddressSpace() const override { return 0; } + unsigned getLocalAddressSpace() const override { return 3; } +}; + +// +// TargetFusionInfo +// + +TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) { + llvm::Triple Tri(Mod->getTargetTriple()); + if (Tri.isNVPTX()) { + Impl = std::make_shared(Mod); + return; } - LLVMMod->eraseNamedMetadata(OldAnnotations); - auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); - for (auto *Kernel : ValidKernels) { - NewAnnotations->addOperand(Kernel); + if (Tri.isSPIRV() || Tri.isSPIR()) { + Impl = std::make_shared(Mod); + return; } + llvm_unreachable("Unsupported target for fusion"); } -void NVPTXTargetFusionInfo::addKernelFunction(Function *KernelFunc) const { - auto *NVVMAnnotations = LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); - auto *MDOne = ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); - auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel"); - auto *MDFunc = ConstantAsMetadata::get(KernelFunc); - SmallVector KernelMD({MDFunc, MDKernelString, MDOne}); - auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD); - NVVMAnnotations->addOperand(Tuple); +void TargetFusionInfo::notifyFunctionsDelete( + llvm::ArrayRef Funcs) const { + Impl->notifyFunctionsDelete(Funcs); } -ArrayRef NVPTXTargetFusionInfo::getKernelMetadataKeys() const { - // FIXME: Check whether we need to take care of sycl_fixed_targets. - static SmallVector Keys{{"kernel_arg_buffer_location", - "kernel_arg_runtime_aligned", - "kernel_arg_exclusive_ptr"}}; - return Keys; +void TargetFusionInfo::addKernelFunction(llvm::Function *KernelFunc) const { + Impl->addKernelFunction(KernelFunc); } -void NVPTXTargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const { - if (BarrierFlags == -1) { - return; - } - // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX - // backend: "The ‘@llvm.nvvm.barrier0()’ intrinsic emits a PTX bar.sync 0 - // instruction, equivalent to the __syncthreads() call in CUDA." - Builder.CreateIntrinsic(Intrinsic::NVVMIntrinsics::nvvm_barrier0, {}, {}); +void TargetFusionInfo::postProcessKernel(Function *KernelFunc) const { + Impl->postProcessKernel(KernelFunc); +} + +llvm::ArrayRef +TargetFusionInfo::getKernelMetadataKeys() const { + return Impl->getKernelMetadataKeys(); +} + +void TargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, + int BarrierFlags) const { + Impl->createBarrierCall(Builder, BarrierFlags); +} + +unsigned TargetFusionInfo::getPrivateAddressSpace() const { + return Impl->getPrivateAddressSpace(); +} + +unsigned TargetFusionInfo::getLocalAddressSpace() const { + return Impl->getLocalAddressSpace(); +} + +void TargetFusionInfo::updateAddressSpaceMetadata(Function *KernelFunc, + ArrayRef LocalSize, + unsigned AddressSpace) const { + Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace); +} + +llvm::ArrayRef +TargetFusionInfo::getUniformKernelAttributes() const { + return Impl->getUniformKernelAttributes(); } // @@ -265,3 +367,5 @@ void MetadataCollection::attachToFunction(llvm::Function *Func) { Func->setMetadata(Key, MDEntries); } } + +} // namespace llvm diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h index c307de2dd764e..f88476c01ebc3 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.h +++ b/sycl-fusion/passes/target/TargetFusionInfo.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef SYCL_FUSION_PASSES_TARGETFUSIONINFO_H -#define SYCL_FUSION_PASSES_TARGETFUSIONINFO_H +#ifndef SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H +#define SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -15,90 +15,7 @@ namespace llvm { -class TargetFusionInfoImpl { - -public: - virtual ~TargetFusionInfoImpl() = default; - - virtual void notifyFunctionsDelete(llvm::ArrayRef Funcs) const { - (void)Funcs; - } - - virtual void addKernelFunction(Function *KernelFunc) const { - (void)KernelFunc; - } - - virtual void postProcessKernel(Function *KernelFunc) const { - (void)KernelFunc; - } - - virtual ArrayRef getKernelMetadataKeys() const { return {}; } - - virtual void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const = 0; - - virtual unsigned getPrivateAddressSpace() const = 0; - - virtual unsigned getLocalAddressSpace() const = 0; - - virtual void updateAddressSpaceMetadata(Function *KernelFunc, - ArrayRef LocalSize, - unsigned AddressSpace) const { - (void)KernelFunc; - (void)LocalSize; - } - -protected: - explicit TargetFusionInfoImpl(llvm::Module *Mod) : LLVMMod{Mod} {}; - - llvm::Module *LLVMMod; - - friend class TargetFusionInfo; -}; - -class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { -public: - void addKernelFunction(Function *KernelFunc) const override; - - ArrayRef getKernelMetadataKeys() const override; - - void postProcessKernel(Function *KernelFunc) const override; - - void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const override; - - // Corresponds to definition of spir_private and spir_local in - // "clang/lib/Basic/Target/SPIR.h", "SPIRDefIsGenMap". - unsigned getPrivateAddressSpace() const override { return 0; } - unsigned getLocalAddressSpace() const override { return 3; } - - void updateAddressSpaceMetadata(Function *KernelFunc, - ArrayRef LocalSize, - unsigned AddressSpace) const override; - -private: - using TargetFusionInfoImpl::TargetFusionInfoImpl; -}; - -class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { -public: - void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override; - - void addKernelFunction(Function *KernelFunc) const override; - - ArrayRef getKernelMetadataKeys() const override; - - void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const override; - - // Corresponds to the definitions in the LLVM NVPTX backend user guide: - // https://llvm.org/docs/NVPTXUsage.html#address-spaces - unsigned getPrivateAddressSpace() const override { return 0; } - unsigned getLocalAddressSpace() const override { return 3; } - -private: - using TargetFusionInfoImpl::TargetFusionInfoImpl; -}; +class TargetFusionInfoImpl; /// /// Common interface to target-specific logic around handling of kernel @@ -108,59 +25,49 @@ class TargetFusionInfo { /// /// Create the correct target-specific implementation based on the target /// triple of \p Module. - static TargetFusionInfo getTargetFusionInfo(llvm::Module *Module); + explicit TargetFusionInfo(llvm::Module *Module); /// /// Notify the target-specific implementation that set of functions \p Funcs /// is about to be erased from the module. This should be called BEFORE /// erasing the functions. - void notifyFunctionsDelete(llvm::ArrayRef Funcs) const { - Impl->notifyFunctionsDelete(Funcs); - } + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const; /// /// Notify the target-specific implementation that the function \p KernelFunc /// was added as a new kernel. This should be called AFTER the function has /// been added. - void addKernelFunction(llvm::Function *KernelFunc) const { - Impl->addKernelFunction(KernelFunc); - } + void addKernelFunction(llvm::Function *KernelFunc) const; /// /// Target-specific post-processing of the new kernel function \p KernelFunc. /// This should be called AFTER the function has been added and defined. - void postProcessKernel(Function *KernelFunc) const { - Impl->postProcessKernel(KernelFunc); - } + void postProcessKernel(Function *KernelFunc) const; /// /// Get the target-specific list of argument metadata attached to each /// function that should be collected and attached to the fused kernel. - llvm::ArrayRef getKernelMetadataKeys() const { - return Impl->getKernelMetadataKeys(); - } + llvm::ArrayRef getKernelMetadataKeys() const; + + /// + /// Get the target-specific list of kernel function attributes that are + /// uniform across all input kernels and should be attached to the fused + /// kernel. + llvm::ArrayRef getUniformKernelAttributes() const; - void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const { - Impl->createBarrierCall(Builder, BarrierFlags); - } + void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const; - unsigned getPrivateAddressSpace() const { - return Impl->getPrivateAddressSpace(); - } + unsigned getPrivateAddressSpace() const; - unsigned getLocalAddressSpace() const { return Impl->getLocalAddressSpace(); } + unsigned getLocalAddressSpace() const; void updateAddressSpaceMetadata(Function *KernelFunc, ArrayRef LocalSize, - unsigned AddressSpace) const { - Impl->updateAddressSpaceMetadata(KernelFunc, LocalSize, AddressSpace); - } + unsigned AddressSpace) const; private: using ImplPtr = std::shared_ptr; - TargetFusionInfo(ImplPtr &&I) : Impl{I} {} - ImplPtr Impl; }; @@ -183,4 +90,4 @@ class MetadataCollection { }; } // namespace llvm -#endif // SYCL_FUSION_PASSES_TARGETFUSIONINFO_H +#endif // SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index efb105d1ba090..1a34178a4a3c2 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -105,18 +105,10 @@ affinityDomainToString(info::partition_affinity_domain AffinityDomain) { } // Mapping expected SYCL return types to those returned by PI calls -template struct sycl_to_pi { - using type = T; -}; -template <> struct sycl_to_pi { - using type = pi_bool; -}; -template <> struct sycl_to_pi { - using type = RT::PiDevice; -}; -template <> struct sycl_to_pi { - using type = RT::PiPlatform; -}; +template struct sycl_to_pi { using type = T; }; +template <> struct sycl_to_pi { using type = pi_bool; }; +template <> struct sycl_to_pi { using type = RT::PiDevice; }; +template <> struct sycl_to_pi { using type = RT::PiPlatform; }; // Mapping fp_config device info types to the values used to check fp support template struct check_fp_support {}; @@ -771,7 +763,8 @@ struct get_device_info_impl< // Currently fusion is only supported for SPIR-V based backends, i.e. OpenCL // and LevelZero. return (Dev->getBackend() == backend::ext_oneapi_level_zero) || - (Dev->getBackend() == backend::opencl); + (Dev->getBackend() == backend::opencl) || + (Dev->getBackend() == backend::ext_oneapi_cuda); #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION (void)Dev; return false; diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index ffcdca25584f7..ae20134d46b8c 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -62,21 +62,16 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { backend::ext_oneapi_cuda; if (isNvidia) { auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); - std::vector KernelIds; - KernelIds.push_back(KernelID); + std::vector KernelIds{KernelID}; auto DeviceImages = ProgramManager::getInstance().getRawDeviceImages(KernelIds); - const RTDeviceBinaryImage *DeviceImage = nullptr; - for (auto *DI : DeviceImages) { - // We are looking for a device image with LLVM IR format and target spec - // "llvm_nvptx64", which has been set by the offload-wrapper action. - if (DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && - DI->getRawData().DeviceTargetSpec == std::string("llvm_nvptx64")) { - DeviceImage = DI; - break; - } - } - if (!DeviceImage) { + auto DeviceImage = std::find_if( + DeviceImages.begin(), DeviceImages.end(), [](RTDeviceBinaryImage *DI) { + return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && + DI->getRawData().DeviceTargetSpec == + std::string("llvm_nvptx64"); + }); + if (DeviceImage == DeviceImages.end()) { return {nullptr, nullptr}; } auto ContextImpl = Queue->getContextImplPtr(); @@ -84,9 +79,9 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { auto DeviceImpl = Queue->getDeviceImplPtr(); auto Device = detail::createSyclObjFromImpl(DeviceImpl); RT::PiProgram Program = - detail::ProgramManager::getInstance().createPIProgram(*DeviceImage, + detail::ProgramManager::getInstance().createPIProgram(**DeviceImage, Context, Device); - return {DeviceImage, Program}; + return {*DeviceImage, Program}; } const RTDeviceBinaryImage *DeviceImage = nullptr; @@ -460,6 +455,11 @@ static ParamIterator preProcessArguments( if (Arg->Arg.MPtr) { Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize); + // Propagate values of scalar parameters as constants to the JIT + // compiler. + JITConstants.emplace_back( + ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex}, + Arg->Arg.MPtr, Arg->Arg.MSize); } // Standard layout arguments do not participate in identical argument // detection, but we still add it to the list here. As the SYCL runtime can @@ -468,16 +468,10 @@ static ParamIterator preProcessArguments( // not be materialized by the JIT compiler. Instead of removing some // standard layout arguments due to identity and missing some in case the // materialization is not possible, we rely on constant propagation to - // replace standard layout arguments by constants (see below). + // replace standard layout arguments by constants. NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex, true); - // Propagate values of scalar parameters as constants to the JIT - // compiler. - JITConstants.emplace_back( - ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex}, - Arg->Arg.MPtr, Arg->Arg.MSize); return ++Arg; - } // First check if there's already another parameter with identical // value. diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index a019ab9378226..641345aa74353 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -289,7 +289,7 @@ class Command { // XPTI instrumentation. Copy code location details to the internal struct. // Memory is allocated in this method and released in destructor. void copySubmissionCodeLocation(); - + /// Clear all dependency events for device and host dependencies. This should /// only be used if a command is about to be deleted without being executed /// before that. From 4ba8e44e1ec2feaa508c000553af20ece976477e Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 22 Mar 2023 16:40:10 +0000 Subject: [PATCH 16/25] [SYCL][Fusion] Update linkage graph diagram Signed-off-by: Lukas Sommer --- clang/lib/Driver/Driver.cpp | 106 ++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index a7b15f4d8a9e5..6cb49ba5a0050 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -5517,6 +5517,8 @@ class OffloadingActionBuilder final { // s - device code split requested // r - relocatable device code is requested // f - link object output type is TY_Tempfilelist (fat archive) + // e - Embedded IR for fusion (-fsycl-embed-ir) was requested + // and target is NVPTX. // * - "all other cases" // - no condition means output/input is "always" present // First symbol indicates output/input type @@ -5536,58 +5538,58 @@ class OffloadingActionBuilder final { // | | // | | // .---------------------------------------. - // | PostLink | - // .---------------------------------------. - // [+*] [+] - // | | - // | | - // |--------- | - // | | | - // | | | - // | [+!rf] | - // | .-------------. | - // | | llvm-foreach| | - // | .-------------. | - // | | | - // [+*] [+!rf] | - // .-----------------. | - // | FileTableTform | | - // | (extract "Code")| | - // .-----------------. | - // [-] |----------- - // --------------------| | - // | | | - // | |----------------- | - // | | | | - // | | [-!rf] | - // | | .--------------. | - // | | |FileTableTform| | - // | | | (merge) | | - // | | .--------------. | - // | | [-] |------- - // | | | | | - // | | | ------| | - // | | --------| | | - // [.] [-*] [-!rf] [+!rf] | - // .---------------. .-------------------. .--------------. | - // | finalizeNVPTX | | SPIRVTranslator | |FileTableTform| | - // | finalizeAMDGCN | | | | (merge) | | - // .---------------. .-------------------. . -------------. | - // [.] [-as] [-!a] | | - // | | | | | - // | [-s] | | | - // | .----------------. | | | - // | | BackendCompile | | | | - // | .----------------. | ------| | - // | [-s] | | | - // | | | | | - // | [-a] [-!a] [-!rf] | - // | .--------------------. | - // -----------[-n]| FileTableTform |[+*]--------------| - // | (replace "Code") | - // .--------------------. - // | - // [+*] + // | PostLink |[+e]---------------- + // .---------------------------------------. | + // [+*] [+] | + // | | | + // | | | + // |--------- | | + // | | | | + // | | | | + // | [+!rf] | | + // | .-------------. | | + // | | llvm-foreach| | | + // | .-------------. | | + // | | | | + // [+*] [+!rf] | | + // .-----------------. | | + // | FileTableTform | | | + // | (extract "Code")| | | + // .-----------------. | | + // [-] |----------- | + // --------------------| | | + // | | | | + // | |----------------- | | + // | | | | | + // | | [-!rf] | | + // | | .--------------. | | + // | | |FileTableTform| | | + // | | | (merge) | | | + // | | .--------------. | | + // | | [-] |------- | + // | | | | | | + // | | | ------| | | + // | | --------| | | | + // [.] [-*] [-!rf] [+!rf] | | + // .---------------. .-------------------. .--------------. | | + // | finalizeNVPTX | | SPIRVTranslator | |FileTableTform| | | + // | finalizeAMDGCN | | | | (merge) | | | + // .---------------. .-------------------. . -------------. | | + // [.] [-as] [-!a] | | | + // | | | | | | + // | [-s] | | | | + // | .----------------. | | | | + // | | BackendCompile | | | | | + // | .----------------. | ------| | | + // | [-s] | | | | + // | | | | | | + // | [-a] [-!a] [-!rf] | | + // | .--------------------. | | + // -----------[-n]| FileTableTform |[+*]--------------| | + // | (replace "Code") | | + // .--------------------. | + // | ------------------------- + // [+*] | [+e] // .--------------------------------------. // | OffloadWrapper | // .--------------------------------------. From 8fcd4c7224f33dde24d9314fa12bf2f5f3126034 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 28 Mar 2023 13:19:51 +0100 Subject: [PATCH 17/25] Don't compile NVPTX-specifics if not supported Signed-off-by: Lukas Sommer --- .../jit-compiler/lib/translation/KernelTranslation.cpp | 2 +- sycl-fusion/passes/CMakeLists.txt | 8 ++++++++ sycl-fusion/passes/target/TargetFusionInfo.cpp | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 8b1f3d913026d..395cd64f0f913 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -227,7 +227,6 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, LLVMInitializeNVPTXTarget(); LLVMInitializeNVPTXAsmPrinter(); LLVMInitializeNVPTXTargetMC(); -#endif // FUSION_JIT_SUPPORT_PTX static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; @@ -283,4 +282,5 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, } return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX); +#endif // FUSION_JIT_SUPPORT_PTX } diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index 95f504091dc30..4693083be4faa 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -26,6 +26,10 @@ target_link_libraries(SYCLKernelFusion sycl-fusion-common ) +if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_PTX) +endif() + # Static library for linking with the jit_compiler add_llvm_library(SYCLKernelFusionPasses SYCLFusionPasses.cpp @@ -60,3 +64,7 @@ target_link_libraries(SYCLKernelFusionPasses PRIVATE sycl-fusion-common ) + +if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_PTX) +endif() diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index 27514bbb80269..2e534c35037fd 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -209,6 +209,7 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { // // NVPTXTargetFusionInfo // +#ifdef FUSION_JIT_SUPPORT_PTX class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { public: using TargetFusionInfoImpl::TargetFusionInfoImpl; @@ -279,6 +280,7 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { unsigned getPrivateAddressSpace() const override { return 0; } unsigned getLocalAddressSpace() const override { return 3; } }; +#endif // FUSION_JIT_SUPPORT_PTX // // TargetFusionInfo @@ -286,10 +288,12 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) { llvm::Triple Tri(Mod->getTargetTriple()); +#ifdef FUSION_JIT_SUPPORT_PTX if (Tri.isNVPTX()) { Impl = std::make_shared(Mod); return; } +#endif // FUSION_JIT_SUPPORT_PTX if (Tri.isSPIRV() || Tri.isSPIR()) { Impl = std::make_shared(Mod); return; From 75a77fd90cc1044f70ca3eb0405d99b82dbd56fc Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 28 Mar 2023 15:30:53 +0100 Subject: [PATCH 18/25] Migrate test changes from intel/llvm-test-suite Signed-off-by: Lukas Sommer --- sycl/test-e2e/KernelFusion/abort_fusion.cpp | 4 +- .../KernelFusion/abort_internalization.cpp | 10 +- .../abort_internalization_stored_ptr.cpp | 4 +- .../barrier_local_internalization.cpp | 4 +- .../KernelFusion/buffer_internalization.cpp | 4 +- sycl/test-e2e/KernelFusion/cancel_fusion.cpp | 4 +- .../test-e2e/KernelFusion/complete_fusion.cpp | 4 +- .../KernelFusion/device_info_descriptor.cpp | 2 +- sycl/test-e2e/KernelFusion/diamond_shape.cpp | 4 +- .../KernelFusion/diamond_shape_local.cpp | 111 ++++++++++++++++++ .../KernelFusion/event_wait_cancel.cpp | 4 +- .../KernelFusion/event_wait_complete.cpp | 4 +- .../KernelFusion/existing_local_accessor.cpp | 78 ++++++++++++ .../internal_explicit_dependency.cpp | 4 +- .../internalize_array_wrapper.cpp | 4 +- .../KernelFusion/internalize_deep.cpp | 4 +- .../KernelFusion/internalize_multi_ptr.cpp | 4 +- .../test-e2e/KernelFusion/internalize_vec.cpp | 4 +- .../KernelFusion/internalize_vfunc.cpp | 4 +- sycl/test-e2e/KernelFusion/jit_caching.cpp | 4 +- .../KernelFusion/local_internalization.cpp | 4 +- .../KernelFusion/non_unit_local_size.cpp | 4 +- .../KernelFusion/pointer_arg_function.cpp | 4 +- .../KernelFusion/private_internalization.cpp | 4 +- .../KernelFusion/ranged_offset_accessor.cpp | 4 +- .../KernelFusion/struct_with_array.cpp | 4 +- .../test-e2e/KernelFusion/sync_acc_mem_op.cpp | 4 +- .../KernelFusion/sync_buffer_destruction.cpp | 4 +- .../test-e2e/KernelFusion/sync_event_wait.cpp | 4 +- .../KernelFusion/sync_host_accessor.cpp | 4 +- sycl/test-e2e/KernelFusion/sync_host_task.cpp | 4 +- .../KernelFusion/sync_queue_destruction.cpp | 4 +- .../test-e2e/KernelFusion/sync_queue_wait.cpp | 4 +- .../KernelFusion/sync_second_queue.cpp | 4 +- .../sync_two_queues_event_dep.cpp | 2 +- .../sync_two_queues_requirement.cpp | 4 +- .../test-e2e/KernelFusion/sync_usm_mem_op.cpp | 8 +- .../KernelFusion/three_dimensional.cpp | 4 +- .../test-e2e/KernelFusion/two_dimensional.cpp | 4 +- .../KernelFusion/usm_no_dependencies.cpp | 4 +- .../KernelFusion/work_group_barrier.cpp | 4 +- sycl/test-e2e/KernelFusion/wrapped_usm.cpp | 4 +- sycl/test-e2e/README.md | 1 + 43 files changed, 276 insertions(+), 80 deletions(-) create mode 100644 sycl/test-e2e/KernelFusion/diamond_shape_local.cpp create mode 100644 sycl/test-e2e/KernelFusion/existing_local_accessor.cpp diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp index 28f5e0d72b5b5..e0a5858b12250 100644 --- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp +++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test fusion being aborted: Different scenarios causing the JIT compiler diff --git a/sycl/test-e2e/KernelFusion/abort_internalization.cpp b/sycl/test-e2e/KernelFusion/abort_internalization.cpp index b314bf3a0d64d..fdb7b3fa6b193 100644 --- a/sycl/test-e2e/KernelFusion/abort_internalization.cpp +++ b/sycl/test-e2e/KernelFusion/abort_internalization.cpp @@ -1,9 +1,11 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out -// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -O2 -fsycl-embed-ir %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\ +// RUN: %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER -// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\ +// RUN: %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test incomplete internalization: Different scenarios causing the JIT compiler diff --git a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp index c3d49cea3c1ab..d05d53e19c68c 100644 --- a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp +++ b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized" // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized" -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test pointers being stored are not internalized. diff --git a/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp b/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp index 8ff486812b921..3dee82d36006d 100644 --- a/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp +++ b/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with local internalization and a combination of kernels diff --git a/sycl/test-e2e/KernelFusion/buffer_internalization.cpp b/sycl/test-e2e/KernelFusion/buffer_internalization.cpp index 102441ed1b8c8..4be40a9389a2f 100644 --- a/sycl/test-e2e/KernelFusion/buffer_internalization.cpp +++ b/sycl/test-e2e/KernelFusion/buffer_internalization.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/cancel_fusion.cpp b/sycl/test-e2e/KernelFusion/cancel_fusion.cpp index 6c94f9902579a..9dc5ebe2c007c 100644 --- a/sycl/test-e2e/KernelFusion/cancel_fusion.cpp +++ b/sycl/test-e2e/KernelFusion/cancel_fusion.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test cancel fusion diff --git a/sycl/test-e2e/KernelFusion/complete_fusion.cpp b/sycl/test-e2e/KernelFusion/complete_fusion.cpp index 0ffeca17a5e78..67c2fb3d05ec9 100644 --- a/sycl/test-e2e/KernelFusion/complete_fusion.cpp +++ b/sycl/test-e2e/KernelFusion/complete_fusion.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion without any internalization diff --git a/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp b/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp index 91bd1622a5a5f..c7dc498ce2dd7 100644 --- a/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp +++ b/sycl/test-e2e/KernelFusion/device_info_descriptor.cpp @@ -1,7 +1,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// XFAIL: cuda || hip +// XFAIL: hip // REQUIRES: fusion // Test correct return from device information descriptor. diff --git a/sycl/test-e2e/KernelFusion/diamond_shape.cpp b/sycl/test-e2e/KernelFusion/diamond_shape.cpp index 54a513fbfc71a..73ce385e1c848 100644 --- a/sycl/test-e2e/KernelFusion/diamond_shape.cpp +++ b/sycl/test-e2e/KernelFusion/diamond_shape.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp b/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp new file mode 100644 index 0000000000000..359c86086e8d7 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp @@ -0,0 +1,111 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: hip +// REQUIRES: fusion + +// Test complete fusion with local internalization specified on the +// accessors for a combination of four kernels, forming a diamond-like shape and +// repeating one of the kernels. + +#include + +using namespace sycl; + +struct AddKernel { + accessor accIn1; + accessor accIn2; + accessor accOut; + + void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; } +}; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp1[dataSize], + tmp2[dataSize], tmp3[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp1[i] = -1; + tmp2[i] = -1; + tmp3[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp1{ + tmp1, + range{dataSize}, + {sycl::ext::codeplay::experimental::property::promote_local{}}}; + buffer bTmp2{ + tmp2, + range{dataSize}, + {sycl::ext::codeplay::experimental::property::promote_local{}}}; + buffer bTmp3{ + tmp3, + range{dataSize}, + {sycl::ext::codeplay::experimental::property::promote_local{}}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp1 = bTmp1.get_access(cgh); + cgh.parallel_for(nd_range<1>{{dataSize}, {16}}, + AddKernel{accIn1, accIn2, accTmp1}); + }); + + q.submit([&](handler &cgh) { + auto accTmp1 = bTmp1.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accTmp2 = bTmp2.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {16}}, + [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp1 = bTmp1.get_access(cgh); + auto accTmp3 = bTmp3.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {16}}, + [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp2 = bTmp2.get_access(cgh); + auto accTmp3 = bTmp3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(nd_range<1>{{dataSize}, {16}}, + AddKernel{accTmp2, accTmp3, accOut}); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i + i * 25) && "Computation error"); + assert(tmp1[i] == -1 && "tmp1 not internalized"); + assert(tmp2[i] == -1 && "tmp2 not internalized"); + assert(tmp3[i] == -1 && "tmp3 not internalized"); + } + + return 0; +} diff --git a/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp b/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp index 1f4771d7d5b0d..aff44212531d8 100644 --- a/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp +++ b/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion, aspect-usm_shared_allocations // Test validity of events after cancel_fusion. diff --git a/sycl/test-e2e/KernelFusion/event_wait_complete.cpp b/sycl/test-e2e/KernelFusion/event_wait_complete.cpp index cf48a9c1ced43..20e547a09f64e 100644 --- a/sycl/test-e2e/KernelFusion/event_wait_complete.cpp +++ b/sycl/test-e2e/KernelFusion/event_wait_complete.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion, aspect-usm_shared_allocations // Test validity of events after complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp b/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp new file mode 100644 index 0000000000000..a35473cc1fd51 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp @@ -0,0 +1,78 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: hip +// REQUIRES: fusion + +// Test complete fusion with local internalization and an local accessor that +// already exists in one of the input kernels. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + local_accessor accLocal{16, cgh}; + cgh.parallel_for( + nd_range<1>{{dataSize}, {16}}, [=](nd_item<1> i) { + size_t globalIdx = i.get_global_linear_id(); + size_t localIdx = i.get_local_linear_id(); + accLocal[localIdx] = accIn2[globalIdx]; + accTmp[globalIdx] = accIn1[globalIdx] + accLocal[localIdx]; + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {16}}, + [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp b/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp index 22e3f5ba2b34d..3277ba5ee6f52 100644 --- a/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp +++ b/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion, aspect-usm_shared_allocations // Test complete fusion where one kernel in the fusion list specifies an diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp index ba155e0e5ffac..2ebccb626ef23 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test internalization of a nested array type. diff --git a/sycl/test-e2e/KernelFusion/internalize_deep.cpp b/sycl/test-e2e/KernelFusion/internalize_deep.cpp index 4a0c32e2683b6..2585728259499 100644 --- a/sycl/test-e2e/KernelFusion/internalize_deep.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_deep.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with internalization of a deep struct type. diff --git a/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp b/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp index 9455cad86900a..bd521155be6ed 100644 --- a/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/internalize_vec.cpp b/sycl/test-e2e/KernelFusion/internalize_vec.cpp index 765036ec98d9c..0536a8f15216a 100644 --- a/sycl/test-e2e/KernelFusion/internalize_vec.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_vec.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with internalization of a struct type. diff --git a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp index bf8f177a24904..1a130a404ad71 100644 --- a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/jit_caching.cpp b/sycl/test-e2e/KernelFusion/jit_caching.cpp index 617b81bb4b48c..7c00a7cc61eee 100644 --- a/sycl/test-e2e/KernelFusion/jit_caching.cpp +++ b/sycl/test-e2e/KernelFusion/jit_caching.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION" // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION" -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test caching for JIT fused kernels. Also test for debug messages being diff --git a/sycl/test-e2e/KernelFusion/local_internalization.cpp b/sycl/test-e2e/KernelFusion/local_internalization.cpp index b60bd29df394c..508ad08584f4a 100644 --- a/sycl/test-e2e/KernelFusion/local_internalization.cpp +++ b/sycl/test-e2e/KernelFusion/local_internalization.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with local internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp b/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp index 917eda6e090c7..ffd08f918d414 100644 --- a/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp +++ b/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with local internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp b/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp index ffe5178cdaa9c..7c16d212d8485 100644 --- a/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp +++ b/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // This test currently fails because InferAddressSpace is not able to remove all // address-space casts, causing internalization to fail. diff --git a/sycl/test-e2e/KernelFusion/private_internalization.cpp b/sycl/test-e2e/KernelFusion/private_internalization.cpp index bf7490e5cabab..ca0e8fdeb2f97 100644 --- a/sycl/test-e2e/KernelFusion/private_internalization.cpp +++ b/sycl/test-e2e/KernelFusion/private_internalization.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp b/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp index 7325b26925349..bd2d2d0ea2a40 100644 --- a/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp +++ b/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization on accessors with different diff --git a/sycl/test-e2e/KernelFusion/struct_with_array.cpp b/sycl/test-e2e/KernelFusion/struct_with_array.cpp index 20d25a0d38f50..f79b51908bb49 100644 --- a/sycl/test-e2e/KernelFusion/struct_with_array.cpp +++ b/sycl/test-e2e/KernelFusion/struct_with_array.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization on a kernel functor with an diff --git a/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp index 14643a3d81179..440de656d043b 100644 --- a/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp +++ b/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on an explicit memory operation on an accessor // happening before complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp b/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp index 627a8cdbfe43d..96375f18c41d2 100644 --- a/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp +++ b/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on buffer destruction happening before // complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/sync_event_wait.cpp b/sycl/test-e2e/KernelFusion/sync_event_wait.cpp index d34393638e9b4..d077116412683 100644 --- a/sycl/test-e2e/KernelFusion/sync_event_wait.cpp +++ b/sycl/test-e2e/KernelFusion/sync_event_wait.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on event::wait() happening before // complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp b/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp index 854803f34716d..d6f4cdc77456f 100644 --- a/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp +++ b/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on host accessor creation happening before // complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/sync_host_task.cpp b/sycl/test-e2e/KernelFusion/sync_host_task.cpp index fc94fa9b3d9dc..4c2bc870e2919 100644 --- a/sycl/test-e2e/KernelFusion/sync_host_task.cpp +++ b/sycl/test-e2e/KernelFusion/sync_host_task.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on host task submission happening before // complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp b/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp index 145fde97b5012..936b486c7741b 100644 --- a/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp +++ b/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on queue destruction happening before // complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp b/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp index 5fe768d60c551..71996ffed8cf8 100644 --- a/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp +++ b/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on queue::wait() happening before // complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/sync_second_queue.cpp b/sycl/test-e2e/KernelFusion/sync_second_queue.cpp index 057c96935361b..5147a639196c0 100644 --- a/sycl/test-e2e/KernelFusion/sync_second_queue.cpp +++ b/sycl/test-e2e/KernelFusion/sync_second_queue.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on submission of kernel with requirements to a // different queue happening before complete_fusion. diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp index bb33fcdcb8337..a2c9caa88cabc 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp @@ -3,7 +3,7 @@ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // For this test, complete_fusion must be supported. // REQUIRES: fusion diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp index 3ca9015c7ee22..d3526f2aba029 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // For this test, complete_fusion must be supported. // REQUIRES: fusion diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp index 67af367316fa7..270645af15b40 100644 --- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp +++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp @@ -1,9 +1,9 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %CPU_CHECK_PLACEHOLDER // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ // RUN: %GPU_CHECK_PLACEHOLDER -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // Test fusion cancellation on an explicit memory operation on an USM pointer // happening before complete_fusion. @@ -61,6 +61,10 @@ int main() { fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + for (size_t i = 0; i < dataSize; ++i) { + std::cout << out[i] << ", "; + } + std::cout << "\n"; // Check the results for (size_t i = 0; i < dataSize; ++i) { assert(out[i] == (20 * i * i) && "Computation error"); diff --git a/sycl/test-e2e/KernelFusion/three_dimensional.cpp b/sycl/test-e2e/KernelFusion/three_dimensional.cpp index db0ea25ca3077..805f7f88d782d 100644 --- a/sycl/test-e2e/KernelFusion/three_dimensional.cpp +++ b/sycl/test-e2e/KernelFusion/three_dimensional.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/two_dimensional.cpp b/sycl/test-e2e/KernelFusion/two_dimensional.cpp index 2eafb1c1ccc0a..30359b4713dab 100644 --- a/sycl/test-e2e/KernelFusion/two_dimensional.cpp +++ b/sycl/test-e2e/KernelFusion/two_dimensional.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O2 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir -O2 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with private internalization specified on the diff --git a/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp b/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp index 13290d06e25f1..85a19c4122202 100644 --- a/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp +++ b/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion, aspect-usm_shared_allocations // Test complete fusion using USM pointers. diff --git a/sycl/test-e2e/KernelFusion/work_group_barrier.cpp b/sycl/test-e2e/KernelFusion/work_group_barrier.cpp index 7141c37be8987..5d7e1a6f16806 100644 --- a/sycl/test-e2e/KernelFusion/work_group_barrier.cpp +++ b/sycl/test-e2e/KernelFusion/work_group_barrier.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion // Test complete fusion with a combination of kernels that require a work-group diff --git a/sycl/test-e2e/KernelFusion/wrapped_usm.cpp b/sycl/test-e2e/KernelFusion/wrapped_usm.cpp index aa112ca064ec0..a46c9199b31de 100644 --- a/sycl/test-e2e/KernelFusion/wrapped_usm.cpp +++ b/sycl/test-e2e/KernelFusion/wrapped_usm.cpp @@ -1,7 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip // REQUIRES: fusion, aspect-usm_shared_allocations // Test complete fusion using an wrapped USM pointer as kernel functor argument. diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md index b8dfb34406788..1a751973eaea3 100644 --- a/sycl/test-e2e/README.md +++ b/sycl/test-e2e/README.md @@ -183,6 +183,7 @@ unavailable. * **dump_ir**: - compiler can / cannot dump IR; * **llvm-spirv** - llvm-spirv tool availability; * **llvm-link** - llvm-link tool availability; + * **fusion**: - Runtime supports kernel fusion; ## llvm-lit parameters From a8afe1dac295c1afa135f513dee08364f85c0075 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 5 Apr 2023 15:34:43 +0100 Subject: [PATCH 19/25] Address more PR feedback Signed-off-by: Lukas Sommer --- .../jit-compiler/lib/translation/KernelTranslation.cpp | 3 +++ sycl-fusion/passes/target/TargetFusionInfo.cpp | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 395cd64f0f913..e1435870eb3b2 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -220,6 +220,9 @@ llvm::Expected KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, JITContext &JITCtx) { #ifndef FUSION_JIT_SUPPORT_PTX + (void)KernelInfo; + (void)Mod; + (void)JITCtx; return createStringError(inconvertibleErrorCode(), "PTX translation not supported in this build"); #else // FUSION_JIT_SUPPORT_PTX diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index 2e534c35037fd..e6e15a07e28e8 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -49,6 +49,8 @@ class TargetFusionInfoImpl { llvm::Module *LLVMMod; }; +namespace { + // // SPIRVTargetFusionInfo // @@ -282,6 +284,8 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { }; #endif // FUSION_JIT_SUPPORT_PTX +} // anonymous namespace + // // TargetFusionInfo // From f7df423d473131d18e38b74a7ab7b71608b492dc Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 5 Apr 2023 15:34:59 +0100 Subject: [PATCH 20/25] Add test for kernel fusion with math function Signed-off-by: Lukas Sommer --- sycl/test-e2e/KernelFusion/math_function.cpp | 64 ++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 sycl/test-e2e/KernelFusion/math_function.cpp diff --git a/sycl/test-e2e/KernelFusion/math_function.cpp b/sycl/test-e2e/KernelFusion/math_function.cpp new file mode 100644 index 0000000000000..f2dc1541f69cb --- /dev/null +++ b/sycl/test-e2e/KernelFusion/math_function.cpp @@ -0,0 +1,64 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: hip +// REQUIRES: fusion + +// Test fusion of a kernel using a math function. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + float in1[dataSize], in2[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = 1; + in2[i] = i * 3; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = sycl::cospi(accIn1[i]); }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn2[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (-1.0 * static_cast(i*3)) && "Computation error"); + } + + return 0; +} From bc32fad74fd81d753be7c29f55a303e5ec080fbe Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 5 Apr 2023 15:35:26 +0100 Subject: [PATCH 21/25] Document CUDA kernel fusion in design documentation Signed-off-by: Lukas Sommer --- sycl/doc/design/CompilerAndRuntimeDesign.md | 23 ++ .../doc/design/images/DevicePTXProcessing.svg | 228 ++++++++---------- 2 files changed, 124 insertions(+), 127 deletions(-) diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 3db49acbd36b1..f06b0d52c257d 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -758,6 +758,29 @@ entry: Note: Kernel naming is not fully stable for now. +##### Kernel Fusion Support + +The [experimental kernel fusion +extension](../extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc) +also supports the CUDA backend. However, as neither CUBIN nor PTX are a suitable +input format for the [kernel fusion JIT compiler](KernelFusionJIT.md), a +suitable IR has to be added as an additional device binary. + +Therefore, in case kernel fusion should be performed for the CUDA backend, the +user needs to specify the additional flag `-fsycl-embed-ir` during compilation, +to add LLVM IR as an additional device binary. When the flag `-fsycl-embed-ir` +is specified, the LLVM IR produced by Clang for the CUDA backend device +compilation is added to the fat binary file. To this end, the resulting +file-table from `sycl-post-link` is additionally passed to the +`clang-offload-wrapper`, creating a wrapper object with target `llvm_nvptx64`. + +This device binary in LLVM IR format can be retrieved by the SYCL runtime and +used by the kernel fusion JIT compiler. The resulting fused kernel is compiled +to PTX assembly by the kernel fusion JIT compiler at runtime. + +Note that the device binary in LLVM IR does not replace the device binary in +CUBIN/PTX format, but is embed in addition to it. + ### Integration with SPIR-V format This section explains how to generate SPIR-V specific types and operations from diff --git a/sycl/doc/design/images/DevicePTXProcessing.svg b/sycl/doc/design/images/DevicePTXProcessing.svg index df690ec5fb08b..79a9c5e5c4fc9 100644 --- a/sycl/doc/design/images/DevicePTXProcessing.svg +++ b/sycl/doc/design/images/DevicePTXProcessing.svg @@ -1,20 +1,20 @@ + width="205.79753mm" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns="http://www.w3.org/2000/svg" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:dc="http://purl.org/dc/elements/1.1/"> + + + + inkscape:snap-global="false" + inkscape:showpageshadow="2" + inkscape:pagecheckerboard="0" + inkscape:deskcolor="#d1d1d1" /> @@ -2318,16 +2335,14 @@ d="m 125.31238,20.80355 0.32455,-1.291166 1.29117,-0.324556 z" /> - File table - + y="16.988504">File table - Clang - + y="84.249756">Clang @@ -2495,13 +2508,12 @@ - clang - + x="0 7.7220001 15.444 23.166 30.94416">clang - - - + x="0">- - offload - + x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.528561">offload - - - + x="0">- - wrapper - + x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.458359">wrapper - PTX target processing - + y="60.97049">PTX target processing @@ -2697,13 +2701,12 @@ - Wrapper object - + x="0 12.11652 16.79184 23.517 30.831841 38.146679 45.138599 50.038559 53.141399 60.540482 67.911484 71.267036 78.258957 84.099602">Wrapper object - Device code - + x="0 8.6493597 15.484464 21.841393 25.076113 30.968927 37.930607 41.151264 46.917503 54.329231 61.712833">Device code - (from sycl-post-link) - + y="17.476978">(from sycl-post-link) @@ -2814,13 +2814,11 @@ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Calibri;-inkscape-font-specification:'Calibri, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#404040;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778" id="text899-9" x="73.031509" - y="68.195061"> - libspirv.bc - + sodipodi:role="line">libspirv.bc - libdevice.bc - + sodipodi:role="line">libdevice.bc - ptxas - + y="113.47669">ptxas @@ -2964,13 +2958,11 @@ id="text1309-7" style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.953px;font-family:Consolas;-inkscape-font-specification:'Consolas, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778" x="102.06758" - y="137.37424"> - fatbin - + y="137.37424">fatbin - ptx - + id="tspan3794">ptx - cubin - + id="tspan3916">cubin - LLVM IR - + y="73.327454">LLVM IR - CUDA fatbin - + id="tspan2303">CUDA fatbin - (to host linker) - + y="216.68318">(to host linker) - (nvptx backend) - + y="88.973877">(nvptx backend) - file-table-tform - + y="33.349266">file-table-tform - (Copy "Code") - + y="37.806171">(Copy "Code") - LLVM IR - + y="47.484673">LLVM IR - file-table-tform - + y="161.4454">file-table-tform - (Replace "Code") - + y="165.9023">(Replace "Code") - File table - + id="tspan2303-1">File table + From b4d39684972607d89b25549af6396ac5278d51b9 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 18 Apr 2023 13:02:35 +0100 Subject: [PATCH 22/25] Update kernel fusion design document Signed-off-by: Lukas Sommer --- sycl/doc/design/KernelFusionJIT.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index 6678b9359f832..dc458c4f2b71a 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -162,11 +162,20 @@ The metadata is attached to a function that will become the fused kernel: ### Support for non SPIR-V targets -Non SPIR-V targets (NVPTX / AMDGCN) are not supported at the moment as they cannot ingest a SPIR-V module. However, we are looking into adding support for these targets once the initial SPIR-V based path is operational. +Fusion is currently supported for the NVPTX/CUDA backend. -In this scenario, two options are possible to add JIT support: +As this backend cannot ingest a SPIR-V module, additional changes to the +compilation flow are necessary. During static compilation the LLVM module for +this backend is stored in addition to the finalized binary. - - During static compilation we store the LLVM module on top of the finalized binary. This behavior could be controlled by a flag to avoid a too important binary inflation. Then, during the fusion process, the JIT will load that LLVM IR and finalize the fused kernel to the final target as driven by the PI plugin. - - SPIR-V ingestion support is added for these targets. The module to be loaded could then be the generic SPIR-V module. This path would however exclude target specific optimizations written in user's code. The current state of the SPIR-V translator does not allow this at the moment and significant work is needed to add this support. +This behavior is controlled by the `-fsycl-embed-ir` flag to avoid binary +inflation in case kernel fusion is not used. If users want to use kernel fusion +at runtime on the NVPTX/CUDA backend, they need to pass the `-fsycl-embed-ir` +flag during static compilation. -In these cases, PI will need to be extended to allow to somehow drive the JIT process, so it is tailored to the plugin target needs. +During the fusion process at runtime , the JIT will load the LLVM IR and +finalize the fused kernel to the final target. More information is available +[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). + +Support for the AMD GPU/HIP/AMDGCN backend is not yet implemented, but could +follow an approach similar to the NVPTX/CUDA backend. From a7e13694e3b2418274c3f73f1ef51a50a905e00d Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Thu, 20 Apr 2023 16:56:57 +0100 Subject: [PATCH 23/25] Fix formatting for test Signed-off-by: Lukas Sommer --- sycl/test-e2e/KernelFusion/math_function.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelFusion/math_function.cpp b/sycl/test-e2e/KernelFusion/math_function.cpp index f2dc1541f69cb..ef9357672977e 100644 --- a/sycl/test-e2e/KernelFusion/math_function.cpp +++ b/sycl/test-e2e/KernelFusion/math_function.cpp @@ -4,7 +4,7 @@ // UNSUPPORTED: hip // REQUIRES: fusion -// Test fusion of a kernel using a math function. +// Test fusion of a kernel using a math function. #include @@ -57,7 +57,7 @@ int main() { // Check the results for (size_t i = 0; i < dataSize; ++i) { - assert(out[i] == (-1.0 * static_cast(i*3)) && "Computation error"); + assert(out[i] == (-1.0 * static_cast(i * 3)) && "Computation error"); } return 0; From 88b4ada8df28d99de8daba1874f5c9e23d66f596 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Thu, 27 Apr 2023 13:35:59 +0100 Subject: [PATCH 24/25] Rebase on branch 'sycl' Signed-off-by: Lukas Sommer --- sycl/source/detail/jit_compiler.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index ae20134d46b8c..ed64e35c06509 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -40,7 +40,7 @@ translateBinaryImageFormat(pi::PiDeviceBinaryType Type) { } ::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) { - auto Backend = Queue->getDeviceImplPtr()->getPlugin().getBackend(); + auto Backend = Queue->getDeviceImplPtr()->getBackend(); switch (Backend) { case backend::ext_oneapi_level_zero: case backend::opencl: @@ -58,8 +58,8 @@ std::pair retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { auto KernelName = KernelCG->getKernelName(); - bool isNvidia = Queue->getDeviceImplPtr()->getPlugin().getBackend() == - backend::ext_oneapi_cuda; + bool isNvidia = + Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda; if (isNvidia) { auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); std::vector KernelIds{KernelID}; From 4877a40089e5f7825297e0f8c40cbc4e35d0316c Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Thu, 4 May 2023 13:28:42 +0100 Subject: [PATCH 25/25] Address PR feedback and formatting Signed-off-by: Lukas Sommer --- sycl/doc/design/KernelFusionJIT.md | 2 +- sycl/source/detail/device_info.hpp | 16 ++++++++++++---- sycl/source/detail/scheduler/commands.hpp | 9 ++++++--- sycl/source/detail/scheduler/graph_builder.cpp | 1 - 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index dc458c4f2b71a..53367b5a930b3 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -173,7 +173,7 @@ inflation in case kernel fusion is not used. If users want to use kernel fusion at runtime on the NVPTX/CUDA backend, they need to pass the `-fsycl-embed-ir` flag during static compilation. -During the fusion process at runtime , the JIT will load the LLVM IR and +During the fusion process at runtime, the JIT will load the LLVM IR and finalize the fused kernel to the final target. More information is available [here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 1a34178a4a3c2..85cedb47e6e92 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -105,10 +105,18 @@ affinityDomainToString(info::partition_affinity_domain AffinityDomain) { } // Mapping expected SYCL return types to those returned by PI calls -template struct sycl_to_pi { using type = T; }; -template <> struct sycl_to_pi { using type = pi_bool; }; -template <> struct sycl_to_pi { using type = RT::PiDevice; }; -template <> struct sycl_to_pi { using type = RT::PiPlatform; }; +template struct sycl_to_pi { + using type = T; +}; +template <> struct sycl_to_pi { + using type = pi_bool; +}; +template <> struct sycl_to_pi { + using type = RT::PiDevice; +}; +template <> struct sycl_to_pi { + using type = RT::PiPlatform; +}; // Mapping fp_config device info types to the values used to check fp support template struct check_fp_support {}; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 641345aa74353..5f5f68ed1bf56 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -290,12 +290,15 @@ class Command { // Memory is allocated in this method and released in destructor. void copySubmissionCodeLocation(); - /// Clear all dependency events for device and host dependencies. This should - /// only be used if a command is about to be deleted without being executed - /// before that. + /// Clear all dependency events This should only be used if a command is about + /// to be deleted without being executed before that. As of now, the only + /// valid use case for this function is in kernel fusion, where the fused + /// kernel commands are replaced by the fused command without ever being + /// executed. void clearAllDependencies() { MPreparedDepsEvents.clear(); MPreparedHostDepsEvents.clear(); + MDeps.clear(); } /// Contains list of dependencies(edges) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index a404fe707698b..c40b5c0f69fdc 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1395,7 +1395,6 @@ void Scheduler::GraphBuilder::removeNodeFromGraph( Dep.MDepCommand->MUsers.erase(Node); } - Node->MDeps.clear(); // Clear all the dependencies to avoid cleanDepEventsThroughOneLevel, called // from the destructor of the command to delete the dependencies of the // command this command depends on.