diff --git a/.github/workflows/check-amdllpc-docker.yml b/.github/workflows/check-amdllpc-docker.yml index bb9a75f2fe..8f660b2cf8 100644 --- a/.github/workflows/check-amdllpc-docker.yml +++ b/.github/workflows/check-amdllpc-docker.yml @@ -62,13 +62,13 @@ jobs: echo "${{ github.event.pull_request.number }}" > pr_num.txt - name: Upload code coverage report as a GitHub artifact if: contains(matrix.feature-set, '+coverage') && github.event.pull_request.number - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: cov_report_${{ env.CONFIG_TAG }} path: ${{ env.COVERAGE_REPORT_FILES }} - name: Upload the PR number as a GitHub artifact if: contains(matrix.feature-set, '+coverage') && github.event.pull_request.number - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: pr_num path: pr_num.txt diff --git a/cmake/continuations.cmake b/cmake/continuations.cmake deleted file mode 100644 index f13118c443..0000000000 --- a/cmake/continuations.cmake +++ /dev/null @@ -1,34 +0,0 @@ -## - ####################################################################################################################### - # - # Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. - # - # Permission is hereby granted, free of charge, to any person obtaining a copy - # of this software and associated documentation files (the "Software"), to - # deal in the Software without restriction, including without limitation the - # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - # sell copies of the Software, and to permit persons to whom the Software is - # furnished to do so, subject to the following conditions: - # - # The above copyright notice and this permission notice shall be included in all - # copies or substantial portions of the Software. 
- # - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - # IN THE SOFTWARE. - # - ####################################################################################################################### - -set(LLPC_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/..") - -include("${LLPC_SOURCE_DIR}/cmake/llvmraytracing.cmake") - -# Deprecated transition macro for refactoring transition; use add_llvmraytracing_projects instead -macro(add_continuations_projects) - add_llvmraytracing_projects() - set(LLPC_RAYTRACING_ADD_TRANSITION_TARGETS ON) -endmacro() diff --git a/compilerutils/CMakeLists.txt b/compilerutils/CMakeLists.txt index 4aa8824093..5e6eb5b0b0 100644 --- a/compilerutils/CMakeLists.txt +++ b/compilerutils/CMakeLists.txt @@ -17,6 +17,10 @@ add_llvm_library(LLVMCompilerUtils lib/DxilToLlvm.cpp lib/TypeLowering.cpp lib/TypesMetadata.cpp + lib/ValueOriginTracking.cpp + lib/ValueOriginTrackingTestPass.cpp + lib/ValueSpecialization.cpp + lib/ValueSpecializationTestPass.cpp DEPENDS intrinsics_gen diff --git a/compilerutils/include/compilerutils/CompilerUtils.h b/compilerutils/include/compilerutils/CompilerUtils.h index e273499d47..207df2eef5 100644 --- a/compilerutils/include/compilerutils/CompilerUtils.h +++ b/compilerutils/include/compilerutils/CompilerUtils.h @@ -118,6 +118,8 @@ class CrossModuleInliner { // target module. llvm::GlobalValue *findCopiedGlobal(llvm::GlobalValue &sourceGv, llvm::Module &targetModule); + static std::string getCrossModuleName(llvm::GlobalValue &gv); + private: // Checks that we haven't processed a different target module earlier. 
void checkTargetModule(llvm::Module &targetModule) { diff --git a/compilerutils/include/compilerutils/TypesMetadata.h b/compilerutils/include/compilerutils/TypesMetadata.h index 2e319de7c2..52b0563f1c 100644 --- a/compilerutils/include/compilerutils/TypesMetadata.h +++ b/compilerutils/include/compilerutils/TypesMetadata.h @@ -65,7 +65,7 @@ class TypedFuncTy { // Construct a TypedFuncTy for the given result type and arg types. // This constructs the !pointeetys metadata; that can then be attached to a function // using writeMetadata(). - TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys); + TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys, bool IsVarArg = false); // Get a TypedFuncTy for the given Function, looking up the !pointeetys metadata. static TypedFuncTy get(const Function *F); diff --git a/compilerutils/include/compilerutils/ValueOriginTracking.h b/compilerutils/include/compilerutils/ValueOriginTracking.h new file mode 100644 index 0000000000..6e3d9215d4 --- /dev/null +++ b/compilerutils/include/compilerutils/ValueOriginTracking.h @@ -0,0 +1,275 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file ValueOriginTracking.h + * @brief Helpers for tracking the byte-wise origin of SSA values. + * + * @details + * Sometimes we are interested in the byte-wise contents of a value. + * If the value is a constant, this can be determined with standard LLVM helpers like computeKnownBits, + * but even if the value is dynamic it can be helpful to trace where these bytes come from. + * + * For instance, if some outgoing function arguments de-facto preserve incoming function arguments in the same argument + * slot, then this information may be used to enable certain inter-procedural optimizations. + * + * This file provides helpers for such an analysis. + * It can be thought of splitting values into "slices" (e.g. bytes or dwords), and performing an analysis of where + * these values come from, propagating through things like {insert,extract}{value,element}. + * Using single-byte slices results in a potentially more accurate analysis, but has higher runtime cost. + * For every value, the analysis works on the in-memory layout of its type, including padding, even though we analyze + * only SSA values that might end up in registers. + * It can be thought of as describing the memory obtained from storing a value to memory. 
+ * + * In that sense, it is similar to how SROA splits up allocas into ranges, and analyses ranges separately. + * However, we only track contents of SSA values, and do not propagate through memory, and thus generally + * SROA should have been run before to eliminate non-necessary memory operations. + * + * If the client code has extra information on the origin of some intermediate values that this analysis cannot reason + * about, e.g. calls to special functions, or special loads, then it can provide this information in terms of + * assumptions, which use the same format as the analysis result, mapping slices of a value to slices of other values or + * constants. When analyzing a value with an assumption on it, the algorithm then applies the analysis result for + * values referenced by assumptions, and propagates the result through following instructions. + * + * The analysis does not modify functions, however, as part of the analysis, additional constants may be created. + * + * The motivating application that we have implemented this for is propagating constant known arguments into the + * Traversal shader in continuations-based ray tracing: + * + * The Traversal shader is enqueued by potentially multiple call sites in RayGen (RGS), Closest-Hit (CHS) or Miss (MS) + * shaders. If all these call sites share some common constant arguments (e.g. on the ray payload), then we may + * want to propagate these constants into the Traversal shader to reduce register pressure. + * On these call sites, a simple analysis based on known constant values suffices. + * + * However, the Traversal shader is re-entrant, and may enqueue itself. Also, with Any-Hit (AHS) and/or Intersection + * (IS) shaders in the pipeline, these shaders are enqueued by Traversal, which in turn re-enqueue Traversal. 
+ * + * Thus, in order to prove that incoming arguments of the Traversal shader are known constants, we need to prove + * that all TraceRay call sites share these constants, *and* that all functions that might re-enqueue Traversal + * (Traversal itself, AHS, IS) preserve these arguments, or set it to the same constant. + * + * This analysis allows all of the above: It allows to prove that certain outgoing arguments at TraceRay call sites + * have a specific constant value, and allow to prove that outgoing arguments of Traversal/AHS/IS preserve the + * corresponding incoming ones, or more precisely, that argument slots are preserved. + * Because we track on a fine granularity (e.g. dwords), we might be able to prove that parts of a struct argument are + * preserved even if some fields of it are changed. + * + *********************************************************************************************************************** + */ + +#pragma once + +#include +#include +#include + +namespace llvm { +class raw_ostream; +class Constant; +class DataLayout; +class Function; +class Instruction; +class Value; +} // namespace llvm + +namespace CompilerUtils { + +namespace ValueTracking { + +// enum wrapper with some convenience helpers for common operations. +// The contained value is a bitmask of status, and thus multiple status can be set. +// In that case we know that at run time, one of the status holds, but we don't know which one. +// This can occur with phi nodes and select instructions. +// In the common cases, just a single bit is set though. +struct SliceStatus { + // As the actual enum is contained within the struct, its values don't leak into the containing namespace, + // and it's not possible to implicitly cast a SliceStatus to an int, so it's as good as an enum class. 
+ enum StatusEnum : uint32_t { Constant = 0x1, Dynamic = 0x2, UndefOrPoison = 0x4 }; + StatusEnum S = {}; + + SliceStatus(StatusEnum S) : S{S} {} + + static SliceStatus makeEmpty() { return static_cast<StatusEnum>(0); } + + // Returns whether all status bits set in other are also set in us. + bool contains(SliceStatus Other) const { return (*this & Other) == Other; } + + // Returns whether no status bits are set. + bool isEmpty() const { return static_cast<uint32_t>(S) == 0; } + + // Returns whether there is exactly one status bit set. Returns false for an empty status. + bool isSingleStatus() const { + auto AsInt = static_cast<uint32_t>(S); + return (AsInt != 0) && (((AsInt - 1) & AsInt) == 0); + } + + SliceStatus operator&(SliceStatus Other) const { return static_cast<StatusEnum>(S & Other.S); } + + SliceStatus operator|(SliceStatus Other) const { return static_cast<StatusEnum>(S | Other.S); } + + bool operator==(SliceStatus Other) const { return S == Other.S; } + bool operator!=(SliceStatus Other) const { return !(S == Other.S); } +}; + +static constexpr unsigned MaxSliceSize = 4; // Needed for SliceInfo::ConstantValue + +// A slice consists of a consecutive sequence of bytes within the representation of a value. +// We keep track of a potential constant value, and a potential dynamic value that determines +// the byte representation of our slice. +// If both dynamic and constant values are set, then one of them determines the byte representation +// of our slice, but we don't know which. +// If just a single value is set, then we know that that one determines us. +// +// Allowing both a dynamic and a constant value is intended to allow patterns where a value +// is either a constant, or a passed-through argument. If the constant matches the values used +// to initialize the incoming argument on the caller side, then we can still prove that the value +// is in fact constant. +// +// If the bit width of a value is not a multiple of the slice size, the last slice contains +// unspecified high bits.
These are not guaranteed to be zeroed out. +struct SliceInfo { + SliceInfo(SliceStatus S) : Status{S} {} + void print(llvm::raw_ostream &OS, bool Compact = false) const; + + // Enum-bitmask of possible status of the value. + SliceStatus Status = SliceStatus::makeEmpty(); + uint32_t ConstantValue = 0; + static_assert(sizeof(ConstantValue) >= MaxSliceSize); + // If set, the byte representation of this slice is obtained + // from the given value at the given offset. + llvm::Value *DynamicValue = nullptr; + unsigned DynamicValueByteOffset = 0; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SliceInfo &BI); + +// Combines slice infos for a whole value, unless the value is too large, in which case it might be cut off. +// It is up to client code to detect missing slice infos at the value tail if that is relevant, +// e.g. in order to prove that all bytes in a value match some assumption. +struct ValueInfo { + void print(llvm::raw_ostream &OS, bool Compact = false) const; + + // Infos for the byte-wise representation of a value, partitioned into consecutive slices + llvm::SmallVector Slices; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const ValueInfo &VI); + +} // namespace ValueTracking + +// Utility class to track the origin of values, partitioned into slices of e.g. 1 or 4 bytes each. +// See the documentation at the top of this file for details. +// +// The status of each slice is given by its SliceStatus. +// If the size of a value exceeds MaxBytesPerValue, then only a prefix of that size is analyzed. +// This ensures bounded runtime and memory consumption on pathological cases with huge values. +// +// This is intended to be used for interprocedural optimizations, detecting cases where arguments are initialized with a +// constant and then always propagated, allowing to replace the argument by the initial constant. 
+class ValueOriginTracker { +public: + using ValueInfo = ValueTracking::ValueInfo; + // In some cases, client code has additional information on where values originate from, or + // where they should be assumed to originate from just for the purpose of the analysis. + // For instance, if a value is spilled and then re-loaded, value origin tracking + // would consider the reloaded value as unknown dynamic, because it doesn't track memory. + // Value origin assumptions allow the client to provide such extra information. + // For each registered value, when the analysis reaches the given value, it will instead rely on the supplied + // ValueInfo, and replace dynamic references by the analysis result for these dynamic values. + // This means that when querying values for which assumptions were given, it is *not* ensured that + // the exact assumptions are returned. + // + // Consider this example using dword slices: + // %equals.3 = add i32 3, 0 + // %unknown = call i32 @opaque() + // %arr.0 = insertvalue [3 x i32] poison, i32 %equals.3, 0 + // %arr.1 = insertvalue [3 x i32] %arr.0, i32 %unknown, 1 + // %arr.stored = insertvalue [3 x i32] %arr.1, i32 %unknown, 2 + // store [3 x i32] %arr.stored, ptr %ptr + // %reloaded = load [3 x i32], ptr %ptr + // We supply the assumption that the first two dwords of %reloaded are in fact the first two dwords of + // %arr.stored, and that the third dword equals 7 (because we have some additional knowledge somehow). + // Then, when querying %reloaded, the result will be: + // * dword 0: constant: 0x3 (result of the add) + // * dword 1: dynamic: %unknown (offset 0) + // * dword 2: constant: 0x7 + // + // If only some slices are known, the other slices can use the fallback of pointing to the value itself. + // For values with assumptions, we skip the analysis we'd perform otherwise, so adding assumptions can + // lead to worse analysis results on values that can be analyzed.
For now, this feature however + // is intended for values that are otherwise opaque. Support for merging with the standard analysis could be added. + // + // For now, only assumptions on instructions are supported. + // The intended uses of this feature only require it for instructions, and support for non-instructions + // is a bit more complicated but can be added if necessary. + // Also, only a single status on assumptions is allowed. + using ValueOriginAssumptions = llvm::DenseMap; + + ValueOriginTracker(const llvm::DataLayout &DL, unsigned BytesPerSlice = 4, unsigned MaxBytesPerValue = 512, + ValueOriginAssumptions OriginAssumptions = ValueOriginAssumptions{}) + : DL{DL}, BytesPerSlice{BytesPerSlice}, MaxBytesPerValue{MaxBytesPerValue}, + OriginAssumptions(std::move(OriginAssumptions)) {} + + // Computes a value info for the given value. + // If the value has been seen before, returns a cache hit from the ValueInfos map. + // When querying multiple values within the same functions, it is more efficient + // to first run analyzeValues() on all of them together. + ValueInfo getValueInfo(llvm::Value *V); + + // Analyze a set of values in bulk for efficiency. + // Value analysis needs to process whole functions, so analysing multiple values within the same + // function allows to use a single pass for them all. + // The passed values don't have to be instructions, and don't have to be in the same functions, + // although there is no perf benefit in that case. + // Values may contain duplicates. + void analyzeValues(llvm::ArrayRef Values); + +private: + struct ValueInfoBuilder; + const llvm::DataLayout &DL; + unsigned BytesPerSlice = 0; + unsigned MaxBytesPerValue = 0; + ValueOriginAssumptions OriginAssumptions; + llvm::DenseMap ValueInfos; + + // Analyze a value, creating a ValueInfo for it. + // If V is an instruction, this assumes the ValueInfos of dependencies have + // already been created. 
If some miss, we assume cyclic dependencies and give up + // on this value. + ValueInfo computeValueInfo(llvm::Value *V); + // Same as above, implementing constant analysis + ValueInfo computeConstantValueInfo(ValueInfoBuilder &VIB, llvm::Constant *C); + // Given an origin assumption, compute a value info that combines analysis results + // of the values referenced by the assumption. + ValueInfo computeValueInfoFromAssumption(ValueInfoBuilder &VIB, const ValueInfo &OriginAssumption); + + // Implementation function for analyzeValues(): + // Ensures that the ValueInfos map contains an entry for V, by optionally computing a value info first. + // Then, return a reference to the value info object within the map. + // The resulting reference is invalidated if ValueInfos is mutated. + // Assumes that all values this depends on have already been analyzed, except for phi nodes, + // which are handled pessimistically in case of loops. + ValueInfo &getOrComputeValueInfo(llvm::Value *V, bool KnownToBeNew = false); +}; + +} // namespace CompilerUtils diff --git a/compilerutils/include/compilerutils/ValueSpecialization.h b/compilerutils/include/compilerutils/ValueSpecialization.h new file mode 100644 index 0000000000..e1ca13c231 --- /dev/null +++ b/compilerutils/include/compilerutils/ValueSpecialization.h @@ -0,0 +1,176 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file ValueSpecialization.h + * @brief Helpers for changing the dword-wise representation of a value. + * + * @details + * Utility to replace dwords in the byte-wise representation of generic values by known constants or frozen poison. + * + * This is equivalent to storing a value to an alloca, then replacing some dwords, and then reading the value + * back, but does so without introducing an alloca, and instead directly working on the SSA value using + * {insert,extract}{value,element} instructions, and bit-wise ops for 64-bit values. + * + * Replacements are not guaranteed to succeed in the general case. 
Unsupported cases include: + * * dwords covering scalars smaller than a dword (e.g. i16) + * * dwords covering non-dword-aligned scalars + * + * Thus, this helper is intended for cases where we do not rely on the replacement for functional correctness, + * but instead apply it as an optimization, e.g. for constant propagation, and prefer to do that + * without introducing an alloca. This application motivates the name: The value is specialized for known + * constant contents when used in a particular context. + * + * If needed, the mechanism could be extended to allow replacement of dwords by dynamic values. + * + *********************************************************************************************************************** + */ + +#pragma once + +#include +#include +#include + +namespace llvm { +class DataLayout; +class Value; +class Module; +class StringRef; +} // namespace llvm + +namespace CompilerUtils { + +class ValueSpecializer { +public: + enum class SpecializationKind { + None = 0, // Keep the dword in the value as-is. + Constant, // Replace by a constant. + FrozenPoison, // Replace by a frozen poison value. We specialize with frozen poisons to prevent propagation + // of poison into the containing value. For instance, ORing a zext'ed non-frozen i32 poison into an + // i64 poisons the whole i64. + Count + }; + + struct DwordSpecializationInfo { + SpecializationKind Kind = SpecializationKind::None; + uint32_t ConstantValue = 0; + }; + + // An instance of this class can be re-used for multiple replacements on multiple values. + // This allows to re-use the builder insertion point, which can lead to nicer (e.g. for tests) IR. + ValueSpecializer(llvm::Module &M); + + // The IR builder stores a reference to us, so forbid copy and move. 
+ ValueSpecializer(const ValueSpecializer &) = delete; + ValueSpecializer(ValueSpecializer &&) = delete; + ValueSpecializer &operator=(const ValueSpecializer &) = delete; + ValueSpecializer &operator=(ValueSpecializer &&) = delete; + + // Replaces dwords in Val according to DwordInfos, and returns the result. + // Returns nullptr on failure, or if nothing was changed. + // + // Val needs to be an instruction or an argument (so we have a function to put new instructions in). + // For arguments, new instructions for specialization are added to the function entry block. + // For instructions, new instructions are added immediately after the specialized instruction. + // + // If ReplaceUses is set, then all uses of Val are replaced with the result, excluding new instructions that + // are added as part of the replacement. + // + // If PreservePreviousInsertionPoint is set, and this is not the first call of this function, + // we preserve the builder insertion point. In that case, it is the caller's responsibility to ensure that + // the definition of Val dominates the current insertion point. + // If the insertion point is reset, it is set to immediately after the replaced instruction, or after the last + // alloca instruction in the function's entry block for arguments. + // During the replacement, we do not change the insertion point, and just add instructions. + // Thus, it is e.g. safe to preserve the insertion point when only specializing function arguments. + // + // Replacement values of the same type as Val reuse Val's name, plus NameSuffix. + // Temporaries of nested types are not given names.
+ struct ReplacementResult { + llvm::Value *Replacement; // nullptr if no replacement was done + unsigned NumReplacedDwords; + }; + ReplacementResult replaceDwords(llvm::Value *Val, llvm::ArrayRef DwordInfos, + bool ReplaceUses, bool PreservePreviousInsertionPoint, + llvm::StringRef NameSuffix = ".specialized"); + +private: + // We use a callback to keep track of new instructions, which need to be skipped in the final RAUW. + llvm::IRBuilder B; + const llvm::DataLayout &DL; + llvm::Type *I32 = nullptr; + llvm::Type *I64 = nullptr; + bool IsFirstCall = true; + + // Per-run data: + unsigned NumReplacedDwords = 0; + llvm::SmallDenseSet NewInsts; + + llvm::Value *getFrozenPoison(llvm::Type *Ty) { return B.CreateFreeze(llvm::PoisonValue::get(Ty)); } + llvm::Value *getI32Constant(uint32_t ConstantValue) { return llvm::ConstantInt::get(I32, ConstantValue); } + llvm::Value *getI64Constant(uint64_t ConstantValue) { return llvm::ConstantInt::get(I64, ConstantValue); } + + // Replace dwords in Val according to DwordInfos, and return the result. + // Val may be nullptr if all dwords in DwordInfos are specialized, + // meaning the result does not depend on the initial value. + llvm::Value *replaceDwordsInNonAggregate(llvm::Type *Ty, llvm::Value *Val, + llvm::ArrayRef DwordInfos, + llvm::StringRef ReplacementName); + + // Replaces dwords in RootVal according to DwordInfos. Handles both aggregate as well as non-aggregate types. + // Returns the modified value, and nullptr upon failure, or if nothing was changed. + // + // * RootVal: The value we want to replace dwords to constants in. + // * Indices: If RootVal is an aggregate, these indices point to a nested value in RootVal + // that this recursive function call should handle. In that case, + // CurTy and DwordInfos refer to that nested value. + // * CurTy: Type of the (possibly nested) value within RootVal to change. + // * DwordInfos: Dword-wise infos on what to change. 
+ // + // For aggregate types, it recurses into each element, using the same root value, + // populating Indices and CurTy, and restricting DwordInfos to the sub-range according to the element. + // Once we reach a non-aggregate type, we extractvalue that element, apply the non-aggregate replacement, + // and insertvalue the result. + // In case the whole element is replaced, we skip the extractvalue and start with a frozen poison value instead if + // necessary. + // + // The goal is to emit insertvalue instructions that directly insert into the leaf level, + // instead of first extracting a nested (possibly aggregate!) value, then extracting nested values, + // then specializing the nested value, inserting the nested value into the element value, and then + // inserting the element value into the struct. + // For example, when specializing dword 1 to 17 in { { i32, i32 }, i32 } %foo, we want to emit + // %foo.specialized = insertvalue { { i32, i32 }, i32 } %foo, i32 17, 0, 1 + // instead of the naive + // %nested = extractvalue { { i32, i32 }, i32 } %foo, 0 + // %nested.specialized = insertvalue { i32, i32 } %nested, i32 17, 1 + // %foo.specialized = insertvalue { { i32, i32 }, i32 } %foo, { i32, i32 } %nested.specialized, 0 + // + // For non-aggregates, this is just a wrapper around replaceDwordsInNonAggregate. 
+ llvm::Value *replaceDwordsImpl(llvm::Value *RootVal, llvm::SmallVectorImpl &Indices, llvm::Type *CurTy, + llvm::ArrayRef DwordInfos, llvm::StringRef ReplacementName); +}; + +} // namespace CompilerUtils diff --git a/compilerutils/lib/CompilerUtils.cpp b/compilerutils/lib/CompilerUtils.cpp index e454db298a..80ba2b0d2b 100644 --- a/compilerutils/lib/CompilerUtils.cpp +++ b/compilerutils/lib/CompilerUtils.cpp @@ -24,6 +24,8 @@ **********************************************************************************************************************/ #include "compilerutils/CompilerUtils.h" +#include "ValueOriginTrackingTestPass.h" +#include "ValueSpecializationTestPass.h" #include "compilerutils/DxilToLlvm.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/iterator_range.h" @@ -162,17 +164,6 @@ void CompilerUtils::setIsLastUseLoad(llvm::LoadInst &Load) { namespace { -// Get the name of a global that is copied to a different module for inlining. -std::string getCrossModuleName(GlobalValue &gv) { - if (auto *fn = dyn_cast(&gv)) { - // Intrinsics should not be renamed since the IR verifier insists on a "correct" name mangling based on any - // overloaded types. Lgc dialects also require exact name for similar reason. - if (fn->isIntrinsic() || fn->getName().starts_with("lgc.")) - return fn->getName().str(); - } - return (Twine(gv.getName()) + ".cloned." 
+ gv.getParent()->getName()).str(); -} - class CrossModuleValueMaterializer : public ValueMaterializer { public: CrossModuleValueMaterializer(Module *targetMod, CompilerUtils::CrossModuleInliner &inliner, @@ -198,7 +189,7 @@ class CrossModuleValueMaterializer : public ValueMaterializer { if (auto *existing = inliner->findCopiedGlobal(*gv, *targetMod)) return existing; - auto newName = getCrossModuleName(*gv); + auto newName = CompilerUtils::CrossModuleInliner::getCrossModuleName(*gv); if (auto *callee = dyn_cast(gv)) { if (!callee->isDeclaration()) { report_fatal_error( @@ -386,6 +377,17 @@ GlobalValue *CompilerUtils::CrossModuleInliner::findCopiedGlobal(GlobalValue &so return gv; } +// Get the name of a global that is copied to a different module for inlining. +std::string CompilerUtils::CrossModuleInliner::getCrossModuleName(GlobalValue &gv) { + if (auto *fn = dyn_cast(&gv)) { + // Intrinsics should not be renamed since the IR verifier insists on a "correct" name mangling based on any + // overloaded types. Lgc dialects also require exact name for similar reason. + if (fn->isIntrinsic() || fn->getName().starts_with("lgc.")) + return fn->getName().str(); + } + return (Twine(gv.getName()) + ".cloned." 
+ gv.getParent()->getName()).str(); +} + PointerType *llvm::getWithSamePointeeType(PointerType *ptrTy, unsigned addressSpace) { #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 482880 return PointerType::getWithSamePointeeType(ptrTy, addressSpace); diff --git a/compilerutils/lib/DxilToLlvm.cpp b/compilerutils/lib/DxilToLlvm.cpp index 47afe786ab..0ee0c77d62 100644 --- a/compilerutils/lib/DxilToLlvm.cpp +++ b/compilerutils/lib/DxilToLlvm.cpp @@ -283,6 +283,26 @@ struct DxilToLlvmPassImpl { m_typeLower.eraseInstruction(&extractElement); } + void visitShuffleVector(llvm::ShuffleVectorInst &shuffleVector) { + Value *inputVector0 = shuffleVector.getOperand(0); + Value *inputVector1 = shuffleVector.getOperand(1); + ArrayRef shuffleMask = shuffleVector.getShuffleMask(); + Type *elementTy = cast(inputVector0->getType())->getElementType(); + assert(cast(inputVector1->getType())->getElementType() == elementTy); + + if (convertVectorElementType(elementTy) == nullptr) + return; + + Value *convertedInputVector0 = getConvertedValue(inputVector0); + Value *convertedInputVector1 = getConvertedValue(inputVector1); + + IRBuilder<> builder(&shuffleVector); + auto *replacement = + builder.CreateShuffleVector(convertedInputVector0, convertedInputVector1, shuffleMask, shuffleVector.getName()); + + m_typeLower.replaceInstruction(&shuffleVector, replacement); + } + void visitGEP(llvm::GetElementPtrInst &gepInst) { Type *oldTy = gepInst.getSourceElementType(); Type *newTy = getConvertedType(oldTy); @@ -322,6 +342,7 @@ struct DxilToLlvmPassImpl { .nest(&TypeLowering::registerVisitors) .add(&DxilToLlvmPassImpl::visitInsertElement) .add(&DxilToLlvmPassImpl::visitExtractElement) + .add(&DxilToLlvmPassImpl::visitShuffleVector) .add(&DxilToLlvmPassImpl::visitGEP) .build(); fixFunctionTypes(); diff --git a/compilerutils/lib/PassRegistry.inc b/compilerutils/lib/PassRegistry.inc index 13b599016a..385580b1d6 100644 --- a/compilerutils/lib/PassRegistry.inc +++ b/compilerutils/lib/PassRegistry.inc @@ 
-38,6 +38,8 @@ #endif COMPILERUTILS_MODULE_PASS("dxil-to-llvm", DxilToLlvmPass()) +COMPILERUTILS_MODULE_PASS("value-origin-tracking-test", ValueOriginTrackingTestPass()) +COMPILERUTILS_MODULE_PASS("value-specialization-test", ValueSpecializationTestPass()) #undef COMPILERUTILS_PASS #undef COMPILERUTILS_MODULE_PASS diff --git a/compilerutils/lib/TypesMetadata.cpp b/compilerutils/lib/TypesMetadata.cpp index 23f0e01004..c69790fe3a 100644 --- a/compilerutils/lib/TypesMetadata.cpp +++ b/compilerutils/lib/TypesMetadata.cpp @@ -101,7 +101,7 @@ TypedFuncTy TypedFuncTy::get(const Function *F) { // Construct a TypedFuncTy for the given result type and arg types. // This constructs the !pointeetys metadata; that can then be attached to a function // using writeMetadata(). -TypedFuncTy::TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys) { +TypedFuncTy::TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys, bool IsVarArg) { SmallVector BareArgTys; SmallVector PointeeTys; unsigned SimpleFormatArgIdx = UINT_MAX; @@ -133,7 +133,7 @@ TypedFuncTy::TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys) { } } } - FuncTy = FunctionType::get(ResultTy.asType(), BareArgTys, /*isVarArg=*/false); + FuncTy = FunctionType::get(ResultTy.asType(), BareArgTys, IsVarArg); if (!PointeeTys.empty()) Meta = MDTuple::get(FuncTy->getContext(), PointeeTys); } @@ -239,6 +239,6 @@ void llvm::DXILValueTypeMetadataCallback(Value *V, unsigned TypeID, GetTypeByIDT else ArgTys.push_back(ArgTy); } - TypedFuncTy(ReturnTy, ArgTys).writeMetadata(cast(V)); + TypedFuncTy(ReturnTy, ArgTys, FuncTy->isVarArg()).writeMetadata(cast(V)); } } diff --git a/compilerutils/lib/ValueOriginTracking.cpp b/compilerutils/lib/ValueOriginTracking.cpp new file mode 100644 index 0000000000..a5f57e7d1f --- /dev/null +++ b/compilerutils/lib/ValueOriginTracking.cpp @@ -0,0 +1,826 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro 
Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#include "compilerutils/ValueOriginTracking.h" +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_TYPE "value-origin-tracking" + +using namespace CompilerUtils; +using namespace CompilerUtils::ValueTracking; +using namespace llvm; + +namespace CompilerUtils { + +namespace { + +// Given indices into an aggregate type used in {extract,insert}value instructions, +// compute the byte offset of the value indexed by the indices. 
+static unsigned computeByteOffsetInAggregate(Type *AggTy, ArrayRef Indices, const DataLayout &DL) { + Type *I32 = IntegerType::getInt32Ty(AggTy->getContext()); + // Compute the byte offset of the extracted value by essentially interpreting the indices as GEP indices + // TODO: Can we do this without the GEP hack, and without re-implementing aggregate bit layouts? + SmallVector GEPIndices; + GEPIndices.reserve(Indices.size()); + GEPIndices.push_back(ConstantInt::getSigned(I32, 0)); + for (auto Idx : Indices) + GEPIndices.push_back(ConstantInt::getSigned(I32, Idx)); + + APInt APOffset{32, 0}; + [[maybe_unused]] bool Success = GEPOperator::accumulateConstantOffset(AggTy, GEPIndices, DL, APOffset); + // This should always succeed with constant indices + assert(Success); + + unsigned Offset = APOffset.getZExtValue(); + return Offset; +} + +static std::optional computeByteOffsetInVector(Type *VecTy, Value *IndexArg, const DataLayout &DL) { + auto *ConstantIndex = dyn_cast(IndexArg); + if (!ConstantIndex) + return std::nullopt; + + Type *ElemTy = cast(VecTy)->getElementType(); + unsigned BitWidth = DL.getTypeSizeInBits(ElemTy); + if (BitWidth % 8) + return std::nullopt; + + unsigned Index = ConstantIndex->getZExtValue(); + return Index * (BitWidth / 8); +} + +// Combine slice infos for a select or phi instruction, so we know that our slice equals +// one of the given slices, but we don't know which. 
+std::optional static combineSliceInfosForSelect(ArrayRef Slices) { + if (Slices.empty()) + return std::nullopt; + if (Slices.size() == 1) + return *Slices[0]; + + SliceInfo Result{SliceStatus::makeEmpty()}; + auto AddResultStatusBit = [&Result](SliceStatus StatusBit) { + assert(StatusBit.isSingleStatus()); + Result.Status = (Result.Status | StatusBit); + }; + + // Set constant if there is a consistent one + { + std::optional OptConstantValue; + for (const SliceInfo *Slice : Slices) { + if (Slice->Status.contains(SliceStatus::Constant)) { + if (!OptConstantValue.has_value()) { + // we are the first to require a specific constant. + OptConstantValue = Slice->ConstantValue; + } else { + // there already is a value. check for consistency. + if (OptConstantValue.value() != Slice->ConstantValue) { + // conflict. + return std::nullopt; + } + } + } + } + if (OptConstantValue.has_value()) { + AddResultStatusBit(SliceStatus::Constant); + Result.ConstantValue = OptConstantValue.value(); + } + } + + // Set dynamic info if there is a consistent one + { + struct DynInfo { + Value *V; + unsigned Offset; + }; + std::optional OptDynInfo; + for (const SliceInfo *Slice : Slices) { + if (Slice->Status.contains(SliceStatus::Dynamic)) { + DynInfo CurDynInfo = {Slice->DynamicValue, Slice->DynamicValueByteOffset}; + if (!OptDynInfo.has_value()) { + // we are the first to require a specific constant. + OptDynInfo = CurDynInfo; + } else { + // there already is a value. check for consistency. + if (OptDynInfo.value().V != CurDynInfo.V || OptDynInfo->Offset != CurDynInfo.Offset) { + // conflict. 
+ return std::nullopt; + } + } + } + } + if (OptDynInfo.has_value()) { + AddResultStatusBit(SliceStatus::Dynamic); + Result.DynamicValue = OptDynInfo->V; + Result.DynamicValueByteOffset = OptDynInfo->Offset; + } + } + + // Check for UndefOrPoison + if (std::any_of(Slices.begin(), Slices.end(), + [](const SliceInfo *Slice) { return Slice->Status.contains(SliceStatus::UndefOrPoison); })) + AddResultStatusBit(SliceStatus::UndefOrPoison); + return Result; +} + +} // namespace + +// Helper class to create ValueInfos +struct ValueOriginTracker::ValueInfoBuilder { + ValueInfoBuilder(const DataLayout &DL, Value *V, unsigned BytesPerSlice, unsigned MaxBytesPerValue) + : V{V}, BytesPerSlice{BytesPerSlice}, MaxBytesPerValue{MaxBytesPerValue}, + NumBits{static_cast(DL.getTypeSizeInBits(V->getType()).getFixedValue())}, + NumBytes{divideCeil(NumBits, 8)}, NumSlices{ + llvm::divideCeil(std::min(NumBytes, MaxBytesPerValue), BytesPerSlice)} {} + + Value *V = nullptr; // The value for which we are building a ValueInfo + unsigned BytesPerSlice = 0; + unsigned MaxBytesPerValue; + unsigned NumBits = 0; + unsigned NumBytes = 0; + unsigned NumSlices = 0; + + // In cases where we can't reason about a slice, we use a dynamic self-referencing slice. + SliceInfo getDynamicSlice(unsigned SliceIdx) const { + SliceInfo SI{SliceStatus::Dynamic}; + SI.DynamicValue = V; + SI.DynamicValueByteOffset = BytesPerSlice * SliceIdx; + return SI; + } + + ValueInfo createUndef() const { + SliceInfo SI{SliceStatus::UndefOrPoison}; + ValueInfo Result{}; + Result.Slices.resize(NumSlices, SI); + return Result; + } + + // Creates a value info for a value that has the given constant on every slice. 
+ ValueInfo createUniformConstant(uint32_t UniformConstantValue) const { + SliceInfo SI{SliceStatus::Constant}; + SI.ConstantValue = UniformConstantValue; + ValueInfo Result{}; + Result.Slices.reserve(NumSlices); + + unsigned BitsPerSlice = 8 * BytesPerSlice; + unsigned NumRemainingBits = NumBits; + + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + if (NumRemainingBits < BitsPerSlice) { + // For the last slice, zero out the upper dead bits. This isn't required by the interface, + // but is simple and leads to nicer tests. + assert(SliceIdx + 1 == NumSlices); + SI.ConstantValue &= (~0u >> (BitsPerSlice - NumRemainingBits)); + Result.Slices.push_back(SI); + break; + } + Result.Slices.push_back(SI); + NumRemainingBits -= BitsPerSlice; + } + return Result; + } + + // Given KnownBits about the value, return a value info that uses constant slices where possible, + // and fall back to dynamic slices if necessary. + // This may be required for slices where not all bits are known. 
+ ValueInfo createConstant(const KnownBits &KB) const { + assert(KB.One.getBitWidth() == NumBits); + ValueInfo Result{}; + Result.Slices.reserve(NumSlices); + unsigned BitsPerSlice = 8 * BytesPerSlice; + unsigned SliceMask = ~0u >> (8 * (4 - BytesPerSlice)); + unsigned NumRemainingBits = NumBits; + auto GetSliceFromAPInt = [&](const APInt &AI, unsigned SliceIdx) -> uint32_t { + assert(BytesPerSlice <= 4); + unsigned DWIdx = (BytesPerSlice * SliceIdx) / 4; + unsigned ByteOffsetInDW = (BytesPerSlice * SliceIdx) % 4; + unsigned QWIdx = DWIdx / 2; + assert(QWIdx < AI.getNumWords()); + auto QW = AI.getRawData()[QWIdx]; + if (DWIdx % 2) { + QW >>= 32; + } + QW >>= (8 * ByteOffsetInDW); + return QW & SliceMask; + }; + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + auto One = GetSliceFromAPInt(KB.One, SliceIdx); + auto Zero = GetSliceFromAPInt(KB.Zero, SliceIdx); + if (NumRemainingBits < BitsPerSlice) { + // For the last slice, accept a partial known mask, because the tail bits are dead + // and not analyzed by KnownBits + SliceMask >>= (BitsPerSlice - NumRemainingBits); + assert(SliceIdx + 1 == NumSlices); + } + if ((One | Zero) == SliceMask) { + SliceInfo SI{SliceStatus::Constant}; + SI.ConstantValue = One; + Result.Slices.push_back(SI); + } else { + // There are unknown bits. Give up on this slice. + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + } + NumRemainingBits -= BitsPerSlice; + } + return Result; + } + + // Return a value info that just refers to the value itself on every slice. This can always be used as fallback. + ValueInfo createDynamic() const { + ValueInfo Result{}; + Result.Slices.reserve(NumSlices); + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + + return Result; + } + + // Obtain the value info for a sub-range of slices. 
+ ValueInfo createExtraction(const ValueInfo &AggInfo, unsigned ByteOffset) const { + // Note that NumBytes might not be a multiple of slices, and thus + // the last slice of Result might cover data outside of our value. + // But that should be fine, we might just be a bit pessimistic. + if (ByteOffset % BytesPerSlice) { + LLVM_DEBUG(dbgs() << "Non-aligned extract " << *V << ", giving up.\n"); + return createDynamic(); + } + ValueInfo Result{}; + unsigned BeginSlice = ByteOffset / BytesPerSlice; + unsigned ResultNumSlices = NumSlices; + if (BeginSlice < AggInfo.Slices.size()) { + ResultNumSlices = std::min(NumSlices, static_cast(AggInfo.Slices.size() - BeginSlice)); + Result.Slices.append(AggInfo.Slices.begin() + BeginSlice, AggInfo.Slices.begin() + BeginSlice + NumSlices); + } + // Fill up with fallback if necessary + for (unsigned SliceIdx = Result.Slices.size(); SliceIdx < ResultNumSlices; ++SliceIdx) + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + + assert(Result.Slices.size() == ResultNumSlices && ResultNumSlices <= NumSlices); + + return Result; + } + + // Computes a ValueInfo that is obtained by inserting a value at the given byte offset and size + // into this value, e.g. in insert{value, element}. + ValueInfo createInsertion(const ValueInfo &Agg, const ValueInfo &Inserted, unsigned ByteOffset, + unsigned InsertedByteCount) const { + ValueInfo Result = Agg; + unsigned SliceBegin = ByteOffset / BytesPerSlice; + unsigned SliceEnd = + std::min(divideCeil(ByteOffset + InsertedByteCount, BytesPerSlice), Result.Slices.size()); + if (ByteOffset % BytesPerSlice) { + LLVM_DEBUG(dbgs() << "Insertion with non-aligned offset: " << *V << "\n"); + // We don't support merging misaligned slices. Use the fallback for all affected slices. 
+ for (unsigned SliceIdx = SliceBegin; SliceIdx < SliceEnd; ++SliceIdx) + Result.Slices[SliceIdx] = getDynamicSlice(SliceIdx); + + assert(Result.Slices.size() == NumSlices); + return Result; + } + for (unsigned SliceIdx = SliceBegin; SliceIdx < SliceEnd; ++SliceIdx) { + unsigned OtherSliceIdx = SliceIdx - SliceBegin; + if (OtherSliceIdx < Inserted.Slices.size()) + Result.Slices[SliceIdx] = Inserted.Slices[OtherSliceIdx]; + else + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + } + if (InsertedByteCount % BytesPerSlice && SliceBegin < SliceEnd) { + LLVM_DEBUG(dbgs() << "Insertion with non-aligned size " << *V << "\n"); + // The last slice is only partially replaced. + // We don't yet support merging partial slices + Result.Slices[SliceEnd - 1] = getDynamicSlice(SliceEnd - 1); + } + + assert(Result.Slices.size() == NumSlices); + return Result; + } + + // Create a value info for a value that is obtained by selecting one of the given values, + // e.g. in a phi or select instruction. + ValueInfo createSelect(ArrayRef ValueInfos) { + if (ValueInfos.empty()) + return createDynamic(); + if (ValueInfos.size() == 1) + return *ValueInfos[0]; + SmallVector SliceInfos; + SliceInfos.reserve(ValueInfos.size()); + bool Stop = false; + ValueInfo Result; + Result.Slices.reserve(ValueInfos[0]->Slices.size()); + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + SliceInfos.clear(); + for (const ValueInfo *ValueInfo : ValueInfos) { + if (SliceIdx < ValueInfo->Slices.size()) { + SliceInfos.push_back(&ValueInfo->Slices[SliceIdx]); + } else { + // Give up on this and higher slices + Stop = true; + break; + } + } + if (Stop) + break; + std::optional OptSliceInfo = combineSliceInfosForSelect(SliceInfos); + if (OptSliceInfo.has_value()) { + // We succeeded in combining the slices + Result.Slices.push_back(OptSliceInfo.value()); + } else { + // Create dynamic slice + SliceInfo SI{SliceStatus::Dynamic}; + SI.DynamicValue = V; + SI.DynamicValueByteOffset = BytesPerSlice * 
SliceIdx; + Result.Slices.push_back(SI); + } + } + return Result; + } + + // For each slice, the assumption either gives us constant/undef values, or references + // other dynamic values. ReferencedInfos is indexed by slices and gives value infos for these + // referenced dynamic values. + // This function then combines all these infos accordingly. + ValueInfo createFromAssumption(const ValueInfo &Assumption, ArrayRef ReferencedInfos) { + ValueInfo Result; + assert(Assumption.Slices.size() == ReferencedInfos.size()); + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + if (SliceIdx >= Assumption.Slices.size()) { + // If slices are missing in the assumption, use the dynamic fallback + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + continue; + } + // Start with the assumption, then merge with the referenced info if applicable. + // For non-dynamic assumptions, we just use the assumption directly. + SliceInfo AssumptionSI = Assumption.Slices[SliceIdx]; + assert(AssumptionSI.Status.isSingleStatus()); + if (!AssumptionSI.Status.contains(SliceStatus::Dynamic)) { + Result.Slices.push_back(AssumptionSI); + continue; + } + // No multi-status assumptions are allowed, this would require merging constants here + assert(AssumptionSI.Status == SliceStatus::Dynamic); + const ValueInfo *ReferencedInfo = ReferencedInfos[SliceIdx]; + if (ReferencedInfo != nullptr) { + + if (AssumptionSI.DynamicValueByteOffset % BytesPerSlice) { + // Misaligned assumption, give up on this slice + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + continue; + } + unsigned ReferencedSliceIdx = AssumptionSI.DynamicValueByteOffset / BytesPerSlice; + if (ReferencedSliceIdx >= ReferencedInfo->Slices.size()) { + // No referenced slice available + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + continue; + } + // The assumption references an existing slice info. Use that one. 
+ Result.Slices.push_back(ReferencedInfo->Slices[ReferencedSliceIdx]); + } else { + // Missing reference infos are only allowed for self-references + assert(AssumptionSI.DynamicValue == V); + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + } + } + assert(Result.Slices.size() == NumSlices); + return Result; + } +}; + +// Implement status printing also here, because for multi-bit status we want to interleave the printing +// with the referenced values. +void SliceInfo::print(llvm::raw_ostream &OS, bool Compact) const { + bool IsFirst = true; + auto Sep = Compact ? "|" : " | "; + if (!Status.isSingleStatus()) + OS << "("; + if (Status.contains(SliceStatus::UndefOrPoison)) { + if (!IsFirst) + OS << Sep; + IsFirst = false; + OS << (Compact ? "U" : "UndefOrPoison"); + } + if (Status.contains(SliceStatus::Constant)) { + if (!IsFirst) + OS << Sep; + IsFirst = false; + if (Compact) { + OS << "C"; + } else { + OS << "Constant: 0x"; + OS.write_hex(ConstantValue); + } + } + if (Status.contains(SliceStatus::Dynamic)) { + if (!IsFirst) + OS << Sep; + IsFirst = false; + bool IsArg = isa(DynamicValue); + if (Compact) { + OS << (IsArg ? "A" : "D"); + } else { + OS << "Dynamic" << (IsArg ? 
" (argument): " : ": ") << *DynamicValue << " (offset " << DynamicValueByteOffset + << ")"; + } + } + if (!Status.isSingleStatus()) + OS << ")"; +} + +llvm::raw_ostream &CompilerUtils::ValueTracking::operator<<(llvm::raw_ostream &OS, const SliceInfo &SI) { + SI.print(OS); + return OS; +} + +void ValueTracking::ValueInfo::print(llvm::raw_ostream &OS, bool Compact) const { + if (Compact) { + for (const auto &Slice : Slices) { + Slice.print(OS, true); + } + } else { + for (const auto &[Idx, Slice] : enumerate(Slices)) { + if (Idx) + OS << "; "; + OS << Slice; + } + } +} + +llvm::raw_ostream &CompilerUtils::ValueTracking::operator<<(llvm::raw_ostream &OS, const ValueInfo &VI) { + VI.print(OS); + return OS; +} + +ValueInfo ValueOriginTracker::computeConstantValueInfo(ValueInfoBuilder &VIB, llvm::Constant *CV) { + if (CV->isNullValue()) + return VIB.createUniformConstant(0); + + // Don't bother with globals we can't reason about + if (isa(CV) || isa(CV) || isa(CV->getType())) + return VIB.createDynamic(); + + auto Ty = CV->getType(); + unsigned BitsPerSlice = 8 * BytesPerSlice; + // Don't bother with dynamic vectors + auto *VectorTy = dyn_cast(Ty); + auto *ArrayTy = dyn_cast(Ty); + Type *ElemTy = nullptr; + unsigned NumElements = 0; + if (VectorTy) { + ElemTy = VectorTy->getElementType(); + NumElements = VectorTy->getNumElements(); + } else if (ArrayTy) { + ElemTy = ArrayTy->getElementType(); + NumElements = ArrayTy->getNumElements(); + } + + // For integer constants, FP constants, and vector-of-integer constants, use computeKnownBits. + // It does not support vector of FP, or arrays. + if (isa(CV) || isa(CV) || (VectorTy && ElemTy->isIntegerTy())) { + // computeKnownBits only supports integers and integer vector types. + // For vector types, it returns common known bits merged across all elements, as wide as single + // element, instead of known bits of the whole value. Thus, cast non-integers to integers first. 
+ Value *ToBeAnalyzed = CV; + if (!CV->getType()->isIntegerTy()) { + unsigned NumBits = DL.getTypeSizeInBits(CV->getType()); + llvm::Type *IntTy = IntegerType::get(CV->getContext(), NumBits); + ToBeAnalyzed = ConstantExpr::getBitCast(CV, IntTy); + } + auto KnownBits = computeKnownBits(ToBeAnalyzed, DL, 2); + return VIB.createConstant(KnownBits); + } + + // The remainder of this function deals with arrays and vectors only. + if (VectorTy == nullptr && ArrayTy == nullptr) + return VIB.createDynamic(); + + auto *ConstDataSeq = dyn_cast(CV); + auto *ConstArr = dyn_cast(CV); + auto *ConstVec = dyn_cast(CV); + assert(ConstDataSeq == nullptr || ConstDataSeq->getNumElements() == NumElements); + assert(ConstArr == nullptr || ConstArr->getNumOperands() == NumElements); + assert(ConstVec == nullptr || ConstVec->getNumOperands() == NumElements); + + if (ConstDataSeq != nullptr || ConstArr != nullptr || ConstVec != nullptr) { + // Array or vector. Try to concatenate the elements infos if possible. + // This is possible if element sizes are slice-aligned, and no padding needs to be considered. + // We could maybe extend the below to structs, but that's even more complicated because + // we need to account for padding on every element, and there can be nested structs, so ignore them for now. + unsigned BitsPerElement = ElemTy->getPrimitiveSizeInBits(); + unsigned AlignedBitsPerElement = VectorTy ? 
BitsPerElement : 8 * DL.getTypeAllocSize(ElemTy).getFixedValue(); + if (BitsPerElement != AlignedBitsPerElement || BitsPerElement % BitsPerSlice != 0) + return VIB.createDynamic(); + + // Handle constant vector of values whose sizes are integer-multiples of the slice size, + // so we can just concatenate slices element-wise + unsigned SlicesPerElement = BitsPerElement / BitsPerSlice; + ValueInfo Result; + Result.Slices.reserve(SlicesPerElement * NumElements); + for (unsigned ElemIdx = 0; ElemIdx < NumElements; ++ElemIdx) { + // Accessing the element as constant is slightly less efficient, but allows to use the + // computeKnownBits() machinery to obtain bit layouts of floats + llvm::Constant *ElemAsConstant = nullptr; + if (ConstDataSeq) { + ElemAsConstant = ConstDataSeq->getElementAsConstant(ElemIdx); + } else if (ConstArr) { + ElemAsConstant = ConstArr->getOperand(ElemIdx); + } else { + assert(ConstVec != nullptr); + ElemAsConstant = ConstVec->getOperand(ElemIdx); + } + const auto &ValueInfo = getOrComputeValueInfo(ElemAsConstant); + Result.Slices.append(ValueInfo.Slices); + } + return Result; + } + + return VIB.createDynamic(); +} + +ValueInfo ValueOriginTracker::computeValueInfoFromAssumption(ValueInfoBuilder &VIB, const ValueInfo &OriginAssumption) { + SmallVector ReferencedValueInfos; + ReferencedValueInfos.reserve(OriginAssumption.Slices.size()); + for (const auto &AssumptionSliceInfo : OriginAssumption.Slices) { + const ValueInfo *ReferencedValueInfo = nullptr; + if (AssumptionSliceInfo.DynamicValue) { + if (AssumptionSliceInfo.DynamicValue != VIB.V) { + auto ReferencedIt = ValueInfos.find(AssumptionSliceInfo.DynamicValue); + assert(ReferencedIt != ValueInfos.end()); + ReferencedValueInfo = &ReferencedIt->second; + } else { + // The assumption on this slice is trivial, referring to the value itself. 
+ // Leave the nullptr as-is, and handle it in createFromAssumption + } + } + ReferencedValueInfos.push_back(ReferencedValueInfo); + } + return VIB.createFromAssumption(OriginAssumption, ReferencedValueInfos); +} + +// Analyze a value, creating a ValueInfo for it. +// If V is an instruction, this asserts the ValueInfos of dependencies have already been created. +// An exception are PHI nodes: We only support propagation in a single pass, and thus handle loops conservatively, +// treating dependencies on earlier loop iterations as dynamic. Thus, for PHI nodes, if dependencies have not yet +// been analyzed, we assume loop dependencies and give up. +ValueInfo ValueOriginTracker::computeValueInfo(llvm::Value *V) { + ValueInfoBuilder VIB{DL, V, BytesPerSlice, MaxBytesPerValue}; + if (isa(V)) { + return VIB.createUndef(); + } + if (auto *CV = dyn_cast(V)) + return computeConstantValueInfo(VIB, CV); + + Instruction *Inst = dyn_cast(V); + if (!Inst) + return VIB.createDynamic(); + + auto OriginAssumptionIt = OriginAssumptions.find(Inst); + if (OriginAssumptionIt != OriginAssumptions.end()) { + // There is an origin assumption on this instruction. Collect and combine the value infos of referenced values. + // Note: This does not combine with an analysis of V that we would have done without an assumption. + // This can be pessimistic if there are assumptions on values we can analyze, but for now + // this suffices as we only plan to add assumptions on values that are otherwise completely opaque. 
+ return computeValueInfoFromAssumption(VIB, OriginAssumptionIt->second); + } + + switch (Inst->getOpcode()) { + case Instruction::AddrSpaceCast: + case Instruction::BitCast: + case Instruction::Freeze: { + // Just forward the operand for size-preserving type conversions and freeze + auto *Op = Inst->getOperand(0); + auto It = ValueInfos.find(Op); + assert(It != ValueInfos.end()); + return It->second; + } + case Instruction::ExtractElement: { + auto *EE = cast(Inst); + auto *Vec = EE->getVectorOperand(); + auto *IndexArg = EE->getIndexOperand(); + + std::optional Offset = computeByteOffsetInVector(Vec->getType(), IndexArg, DL); + if (!Offset.has_value()) + return VIB.createDynamic(); + + // Obtain ValueInfo for the source aggregate + auto It = ValueInfos.find(Vec); + assert(It != ValueInfos.end()); + const ValueInfo &SrcInfo = It->second; + + // Extract extracted slices + return VIB.createExtraction(SrcInfo, *Offset); + } + case Instruction::ExtractValue: { + auto *EV = cast(Inst); + auto *Src = EV->getAggregateOperand(); + + unsigned Offset = computeByteOffsetInAggregate(Src->getType(), EV->getIndices(), DL); + + // Obtain ValueInfo for the source aggregate + auto It = ValueInfos.find(Src); + assert(It != ValueInfos.end()); + const ValueInfo &SrcInfo = It->second; + + // Extract extracted slices + return VIB.createExtraction(SrcInfo, Offset); + } + case Instruction::InsertElement: { + // TODO: Support shufflevector + auto *IE = cast(Inst); + auto *Vec = IE->getOperand(0); + auto *Inserted = IE->getOperand(1); + auto *IndexArg = IE->getOperand(2); + + std::optional Offset = computeByteOffsetInVector(Vec->getType(), IndexArg, DL); + if (!Offset.has_value()) + return VIB.createDynamic(); + + auto VecIt = ValueInfos.find(Vec); + auto InsertedIt = ValueInfos.find(Inserted); + assert(VecIt != ValueInfos.end() && InsertedIt != ValueInfos.end()); + const auto &VecInfo = VecIt->second; + const auto &InsertedInfo = InsertedIt->second; + unsigned NumInsertedBits = 
Inserted->getType()->getPrimitiveSizeInBits(); + assert(NumInsertedBits % 8 == 0 && NumInsertedBits == 8 * DL.getTypeStoreSize(Inserted->getType())); + unsigned NumInsertedBytes = NumInsertedBits / 8; + + // Combine AggInfo and InsertedInfo + return VIB.createInsertion(VecInfo, InsertedInfo, *Offset, NumInsertedBytes); + } + case Instruction::InsertValue: { + auto *IV = cast(Inst); + auto *Agg = IV->getAggregateOperand(); + auto *Inserted = IV->getInsertedValueOperand(); + auto AggIt = ValueInfos.find(Agg); + auto InsertedIt = ValueInfos.find(Inserted); + assert(AggIt != ValueInfos.end() && InsertedIt != ValueInfos.end()); + + const auto &AggInfo = AggIt->second; + const auto &InsertedInfo = InsertedIt->second; + + unsigned Offset = computeByteOffsetInAggregate(Agg->getType(), IV->getIndices(), DL); + unsigned NumInsertedBytes = DL.getTypeStoreSize(Inserted->getType()); + + // Combine AggInfo and InsertedInfo + return VIB.createInsertion(AggInfo, InsertedInfo, Offset, NumInsertedBytes); + } + case Instruction::PHI: { + auto *PN = cast(Inst); + SmallVector ArgValueInfos; + for (Value *Val : PN->incoming_values()) { + auto It = ValueInfos.find(Val); + if (It == ValueInfos.end()) { + // The incoming value has not been analyzed yet. + // This can be caused by a loop, which we currently don't support. + // We could repeatedly propagate through the loop until a stable state is reached. 
+ return VIB.createDynamic(); + } + ArgValueInfos.push_back(&It->second); + } + return VIB.createSelect(ArgValueInfos); + } + case Instruction::Select: { + auto *SI = cast(Inst); + auto *TrueVal = SI->getTrueValue(); + auto *FalseVal = SI->getFalseValue(); + auto TrueIt = ValueInfos.find(TrueVal); + auto FalseIt = ValueInfos.find(FalseVal); + assert(TrueIt != ValueInfos.end() && FalseIt != ValueInfos.end()); + + const auto &TrueInfo = TrueIt->second; + const auto &FalseInfo = FalseIt->second; + + return VIB.createSelect({&TrueInfo, &FalseInfo}); + } + // For these instructions, don't waste time trying to compute known bits + case Instruction::Call: + case Instruction::GetElementPtr: + case Instruction::Load: + case Instruction::PtrToInt: // PtrToInt and IntToPtr could be supported, but modeling the trunc/zext + case Instruction::IntToPtr: // part is annoying, and we don't need it now. + case Instruction::Store: { + return VIB.createDynamic(); + } + default: { + // As last option, try to use computeKnownBits if possible. + // computeKnownBits also supports vector type, but in that case returns bits common bits of all elements. + // We are however interested in bits of the whole value. Working on the full vector would require a bitcast + // to an integer, but we don't wan't to add instructions in the analysis. 
+ if (V->getType()->isIntegerTy()) { + auto KnownBits = computeKnownBits(V, DL); + return VIB.createConstant(KnownBits); + } + return VIB.createDynamic(); + } + } + llvm_unreachable("unexpected case"); +} + +ValueInfo &ValueOriginTracker::getOrComputeValueInfo(llvm::Value *V, bool KnownToBeNew) { + if (!KnownToBeNew) { + auto It = ValueInfos.find(V); + if (It != ValueInfos.end()) + return It->second; + } + auto InsertionResult = ValueInfos.insert({V, computeValueInfo(V)}); + assert(InsertionResult.second); + return InsertionResult.first->second; +} + +ValueInfo ValueOriginTracker::getValueInfo(llvm::Value *V) { + analyzeValues(V); + assert(ValueInfos.contains(V)); + return ValueInfos[V]; +} + +void ValueOriginTracker::analyzeValues(ArrayRef Values) { + SmallVector WorkList; + SetVector PendingFunctions; + DenseSet PendingBBs; + DenseSet PendingInstructions; + + // Collect all values that the passed values depend on, by working through + // all operands. Instructions are marked in PendingInstructions for later + // processing, other values are directly processed. + + auto AddToWorkList = [&](Value *V) { + if (ValueInfos.contains(V)) { + // Already analyzed, nothing to do + return; + } + if (auto *Inst = dyn_cast(V)) { + bool Inserted = PendingInstructions.insert(Inst).second; + if (Inserted) { + WorkList.push_back(Inst); + if (PendingBBs.insert(Inst->getParent()).second) + PendingFunctions.insert(Inst->getFunction()); + } + } else { + // With general value assumptions, we'd need to add something here to ensure processing of dependencies. 
+ static_assert(std::is_same_v); + getOrComputeValueInfo(V, true); + } + }; + + for (auto *V : Values) + AddToWorkList(V); + + while (!WorkList.empty()) { + // Add instruction operands to the work list + auto *Inst = WorkList.pop_back_val(); + for (auto &Op : Inst->operands()) + AddToWorkList(Op); + + // Add any instructions referenced by origin assumptions to the work list as well + auto OriginAssumptionIt = OriginAssumptions.find(Inst); + if (OriginAssumptionIt != OriginAssumptions.end()) { + const ValueInfo &VI = OriginAssumptionIt->second; + for (const SliceInfo &SI : VI.Slices) { + if (SI.DynamicValue) + AddToWorkList(SI.DynamicValue); + } + } + } + + for (auto *F : PendingFunctions) { + // Traverse BBs of the function in RPO order. + // This ensures instruction dependencies are analyzed before depending instructions, except for loops. + ReversePostOrderTraversal RPOT(F); + for (auto &BB : RPOT) { + if (!PendingBBs.contains(BB)) + continue; + for (auto &Inst : *BB) { + bool WasPending = PendingInstructions.erase(&Inst); + if (WasPending) + getOrComputeValueInfo(&Inst, true); + } + } + } +} + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueOriginTrackingTestPass.cpp b/compilerutils/lib/ValueOriginTrackingTestPass.cpp new file mode 100644 index 0000000000..ff682f5d2c --- /dev/null +++ b/compilerutils/lib/ValueOriginTrackingTestPass.cpp @@ -0,0 +1,133 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#include "ValueOriginTrackingTestPass.h" +#include "compilerutils/CompilerUtils.h" +#include "compilerutils/ValueOriginTracking.h" +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace CompilerUtils; + +namespace { + +cl::opt BytesPerSliceOption("value-origin-tracking-test-bytes-per-slice", cl::init(4)); +cl::opt MaxBytesPerValueOption("value-origin-tracking-test-max-bytes-per-value", cl::init(512)); + +// Parse assumptions made via calls to the assume function. 
+ValueOriginTracker::ValueOriginAssumptions parseAssumptions(Module &Module, Function &AssumeFunc) { + ValueOriginTracker::ValueOriginAssumptions Result; + forEachCall(AssumeFunc, [&](CallInst &AssumptionCall) { + unsigned NumArgs = AssumptionCall.arg_size(); + // We expect one arg for the value, and two per slice. + if (NumArgs % 2 != 1) + report_fatal_error("unexpected number of assumption args"); + // The value we put an assumption on + Value *V = AssumptionCall.getArgOperand(0); + Instruction *Inst = dyn_cast(V); + if (Inst == nullptr) + report_fatal_error("assumptions are only allowed on instructions"); + ValueOriginTracker::ValueInfo Assumption{}; + unsigned NumSlices = (NumArgs - 1) / 2; + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + unsigned SliceArgBeginIdx = 1 + 2 * SliceIdx; + Value *ReferencedValueOrConstant = AssumptionCall.getArgOperand(SliceArgBeginIdx); + if (isa(ReferencedValueOrConstant)) { + Assumption.Slices.push_back({ValueTracking::SliceStatus::UndefOrPoison}); + } else if (auto *CIValue = dyn_cast(ReferencedValueOrConstant)) { + ValueTracking::SliceInfo SI{ValueTracking::SliceStatus::Constant}; + if (!CIValue->getType()->isIntegerTy(32)) + report_fatal_error("expected i32 constant"); + SI.ConstantValue = CIValue->getZExtValue(); + Assumption.Slices.push_back(SI); + } else { + // Dynamic value reference + ValueTracking::SliceInfo SI{ValueTracking::SliceStatus::Dynamic}; + SI.DynamicValue = ReferencedValueOrConstant; + Value *DynamicValueByteOffsetValue = AssumptionCall.getArgOperand(SliceArgBeginIdx + 1); + auto *DynamicValueByteOffsetValueCI = dyn_cast(DynamicValueByteOffsetValue); + if (DynamicValueByteOffsetValueCI == nullptr || !DynamicValueByteOffsetValueCI->getType()->isIntegerTy(32)) + report_fatal_error("expected i32 constant"); + SI.DynamicValueByteOffset = DynamicValueByteOffsetValueCI->getZExtValue(); + Assumption.Slices.push_back(SI); + } + } + bool Inserted = Result.insert({Inst, Assumption}).second; + if 
(!Inserted) + report_fatal_error("value with duplicate assumption"); + }); + return Result; +} + +} // namespace + +namespace CompilerUtils { + +llvm::PreservedAnalyses ValueOriginTrackingTestPass::run(llvm::Module &Module, + llvm::ModuleAnalysisManager &AnalysisManager) { + Function *AnalyzeFunc = Module.getFunction("analyze"); + if (!AnalyzeFunc) + return PreservedAnalyses::all(); + + ValueOriginTracker::ValueOriginAssumptions Assumptions; + Function *AssumeFunc = Module.getFunction("assume"); + if (AssumeFunc) { + Assumptions = parseAssumptions(Module, *AssumeFunc); + } + + ValueOriginTracker VOT{Module.getDataLayout(), BytesPerSliceOption.getValue(), MaxBytesPerValueOption.getValue(), + Assumptions}; + + auto Prefix = "[VOT]: "; + + // Traverse all functions instead of the users of AnalyzeFunc to group output by function + for (auto &F : Module) { + if (F.isDeclaration()) + continue; + + outs() << Prefix << F.getName() << "\n"; + for (auto &BB : F) { + for (auto &I : BB) { + auto *CI = dyn_cast(&I); + if (!CI || CI->getCalledOperand() != AnalyzeFunc) { + continue; + } + + for (Value *Op : CI->data_ops()) { + auto VI = VOT.getValueInfo(Op); + outs() << Prefix << "(" << *Op << "): " << VI << "\n"; + } + outs() << "\n"; + } + } + } + return PreservedAnalyses::all(); +} + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueOriginTrackingTestPass.h b/compilerutils/lib/ValueOriginTrackingTestPass.h new file mode 100644 index 0000000000..3505e0c085 --- /dev/null +++ b/compilerutils/lib/ValueOriginTrackingTestPass.h @@ -0,0 +1,42 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace CompilerUtils { + +// Helper pass to enable lit tests of ValueOriginTracker. +// Calls to a function called "analyze" triggers an analysis its arguments and outputs the analysis to stdout. 
+class ValueOriginTrackingTestPass : public llvm::PassInfoMixin { +public: + llvm::PreservedAnalyses run(llvm::Module &Module, llvm::ModuleAnalysisManager &AnalysisManager); + + static llvm::StringRef name() { return "Test ValueOriginTracking"; } +}; + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueSpecialization.cpp b/compilerutils/lib/ValueSpecialization.cpp new file mode 100644 index 0000000000..4dda822ec0 --- /dev/null +++ b/compilerutils/lib/ValueSpecialization.cpp @@ -0,0 +1,358 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#include "compilerutils/ValueSpecialization.h" +#include "llvm/IR/Module.h" +#include + +#define DEBUG_TYPE "value-specialization" + +using namespace CompilerUtils; +using namespace llvm; + +namespace CompilerUtils { + +namespace { + +struct SpecializationSummary { + bool AllDwordsAreSpecialized = true; + bool AnyDwordIsSpecialized = false; +}; +static SpecializationSummary +computeSpecializationSummary(ArrayRef DwordInfos) { + SpecializationSummary Result = {}; + for (const auto &DWI : DwordInfos) { + if (DWI.Kind != ValueSpecializer::SpecializationKind::None) + Result.AnyDwordIsSpecialized = true; + else + Result.AllDwordsAreSpecialized = false; + } + return Result; +} + +} // namespace + +ValueSpecializer::ValueSpecializer(Module &M) + : B{M.getContext(), ConstantFolder{}, + IRBuilderCallbackInserter{[this](Instruction *Inst) { NewInsts.insert(Inst); }}}, + DL{M.getDataLayout()}, I32{Type::getInt32Ty(M.getContext())}, I64{Type::getInt64Ty(M.getContext())}, + NumReplacedDwords{}, NewInsts{} { +} + +ValueSpecializer::ReplacementResult +ValueSpecializer::replaceDwords(Value *Val, ArrayRef DwordInfos, bool ReplaceUses, + bool PreservePreviousInsertionPoint, StringRef NameSuffix) { + assert(divideCeil(DL.getTypeStoreSize(Val->getType()), 4) == DwordInfos.size()); + NewInsts.clear(); + NumReplacedDwords = 0; + + if (IsFirstCall || !PreservePreviousInsertionPoint) { + if (auto *Arg = dyn_cast(Val)) { + B.SetInsertPoint(Arg->getParent()->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); + } else { + // Insert *after* the given instruction, so we can use it + auto *Inst = cast(Val); + B.SetInsertPoint(Inst->getInsertionPointAfterDef().value()); + } + } + IsFirstCall = false; + + SmallVector Indices; + Value *Replacement = replaceDwordsImpl(Val, Indices, Val->getType(), DwordInfos, (Val->getName() + NameSuffix).str()); + // Should be nullptr if 
nothing changed + assert(Replacement != Val); + if (Replacement != nullptr && ReplaceUses) { + Val->replaceUsesWithIf(Replacement, [this](Use &U) -> bool { return !NewInsts.contains(U.getUser()); }); + } + return {Replacement, NumReplacedDwords}; +} + +Value *ValueSpecializer::replaceDwordsInNonAggregate(Type *Ty, Value *Val, ArrayRef DwordInfos, + StringRef ReplacementName) { + assert(!Ty->isAggregateType()); + + unsigned NumBytes = DL.getTypeStoreSize(Ty); + if (NumBytes % 4) { + // Small and misaligned types are not supported for now. + // We could support specializing prefixes of large, misaligned types later. + return nullptr; + } + [[maybe_unused]] unsigned NumDwords = NumBytes / 4; + assert(DwordInfos.size() == NumDwords); + + if (Ty->isIntegerTy()) { + if (Ty->getIntegerBitWidth() < 32) + return nullptr; + if (Ty->getIntegerBitWidth() == 32) { + const DwordSpecializationInfo &DWI = DwordInfos[0]; + if (DWI.Kind == SpecializationKind::Constant) { + ++NumReplacedDwords; + return getI32Constant(DWI.ConstantValue); + } + if (DWI.Kind == SpecializationKind::FrozenPoison) { + ++NumReplacedDwords; + return getFrozenPoison(Ty); + } + return nullptr; + } + if (Ty->getIntegerBitWidth() == 64) { + const DwordSpecializationInfo &LowInfo = DwordInfos[0]; + const DwordSpecializationInfo &HighInfo = DwordInfos[1]; + SpecializationKind LowKind = LowInfo.Kind; + SpecializationKind HighKind = HighInfo.Kind; + + if (LowKind == HighKind) { + // This can be handled without a bitwise or. + NumReplacedDwords += 2; + if (LowKind == SpecializationKind::Constant) { + // return a single i64 constant. + uint64_t I64Constant = HighInfo.ConstantValue; + I64Constant <<= 32; + I64Constant |= LowInfo.ConstantValue; + return getI64Constant(I64Constant); + } + assert(LowKind == SpecializationKind::FrozenPoison); + return getFrozenPoison(I64); + } + + // Create two separate i64s containing the low and high dwords, and OR them together. 
+ uint64_t SingleDwordMask = ~(uint32_t{0}); + Value *LowDword = nullptr; + if (LowKind == SpecializationKind::None) { + assert(Val); + LowDword = B.CreateAnd(Val, SingleDwordMask); + } else { + ++NumReplacedDwords; + if (LowKind == SpecializationKind::Constant) { + LowDword = getI64Constant(LowInfo.ConstantValue); + } else { + assert(LowKind == SpecializationKind::FrozenPoison); + LowDword = B.CreateAnd(getFrozenPoison(I64), SingleDwordMask); + } + } + + Value *HighDword = nullptr; + if (HighKind == SpecializationKind::None) { + assert(Val); + HighDword = B.CreateAnd(Val, SingleDwordMask << 32); + } else { + ++NumReplacedDwords; + if (HighKind == SpecializationKind::Constant) { + uint64_t HighDwordConstant = HighInfo.ConstantValue; + HighDwordConstant <<= 32; + HighDword = getI64Constant(HighDwordConstant); + } else { + assert(HighKind == SpecializationKind::FrozenPoison); + HighDword = B.CreateAnd(getFrozenPoison(I64), SingleDwordMask << 32); + } + } + + return B.CreateOr(LowDword, HighDword, ReplacementName); + } + + // Give up on other types + return nullptr; + } + + bool IsPointer = Ty->isPointerTy(); + if (Ty->isFloatingPointTy() || IsPointer) { + unsigned BitWidth = 0; + if (auto *PtrTy = dyn_cast(Ty)) + BitWidth = DL.getPointerSizeInBits(PtrTy->getAddressSpace()); + else + BitWidth = Ty->getScalarSizeInBits(); + + if (BitWidth < 32) + return nullptr; + + // Reduce this to integer specialization + Type *IntTy = IntegerType::get(Ty->getContext(), BitWidth); + Value *BaseValue = nullptr; + if (Val) { + // Need to preserve some data, so start with bitcast of original value + if (IsPointer) + BaseValue = B.CreatePtrToInt(Val, IntTy); + else + BaseValue = B.CreateBitCast(Val, IntTy); + } + Value *SpecializedAsInt = replaceDwordsInNonAggregate(IntTy, BaseValue, DwordInfos, {}); + if (!SpecializedAsInt) + return nullptr; + + if (IsPointer) + return B.CreateIntToPtr(SpecializedAsInt, Ty, ReplacementName); + return B.CreateBitCast(SpecializedAsInt, Ty, 
ReplacementName); + } + + // Last remaining case: vectors. + if (isa(Ty)) { + // Not supported. + return nullptr; + } + auto *VTy = cast(Ty); + // Similar to the aggregate case: For small elements, give up. + // For dword-sized elements, just insert the new value. + // For larger elements, extract the value, update it, and insert it again. + Type *ElemTy = VTy->getElementType(); + if (!ElemTy->isIntegerTy() && !ElemTy->isFloatingPointTy()) { + // E.g. pointers, not supported. Could add support if necessary. + return nullptr; + } + unsigned NumElems = VTy->getNumElements(); + unsigned ElemNumBits = ElemTy->getPrimitiveSizeInBits(); + if (ElemNumBits % 32) { + // Give up. + return nullptr; + } + unsigned ElemNumDwords = ElemNumBits / 32; + + // While working on the vector elements, keep track of the current replaced full vector value. + Value *ReplacedVector = Val; + for (unsigned ElemIdx = 0; ElemIdx < NumElems; ++ElemIdx) { + unsigned ElemDwordBegin = ElemIdx * ElemNumDwords; + unsigned ElemDwordEnd = ElemDwordBegin + ElemNumDwords; + assert(ElemDwordEnd <= DwordInfos.size()); + + ArrayRef ElemDwordInfos{DwordInfos.data() + ElemDwordBegin, + DwordInfos.data() + ElemDwordEnd}; + + auto Summary = computeSpecializationSummary(ElemDwordInfos); + if (!Summary.AnyDwordIsSpecialized) { + // Nothing to do on this vector element. + assert(Val != nullptr); + continue; + } + + Value *ElemBaseValue = Summary.AllDwordsAreSpecialized ? nullptr : B.CreateExtractElement(ReplacedVector, ElemIdx); + Value *ReplacedElem = replaceDwordsInNonAggregate(ElemTy, ElemBaseValue, ElemDwordInfos, {}); + if (ReplacedElem) { + if (ReplacedVector == nullptr) { + // Start with a frozen poison value + ReplacedVector = getFrozenPoison(Ty); + } + ReplacedVector = B.CreateInsertElement(ReplacedVector, ReplacedElem, ElemIdx, ReplacementName); + } + } + + // Return nullptr if nothing changed. + return ReplacedVector != Val ? 
ReplacedVector : nullptr; +} + +Value *ValueSpecializer::replaceDwordsImpl(Value *RootVal, SmallVectorImpl &Indices, Type *CurTy, + ArrayRef DwordInfos, StringRef ReplacementName) { + assert(RootVal && CurTy); + + auto Summary = computeSpecializationSummary(DwordInfos); + if (!Summary.AnyDwordIsSpecialized) { + // Nothing to be done. + return nullptr; + } + + if (!CurTy->isAggregateType()) { + // Base value to perform non-aggregate specialization on. Nullptr if all dwords are replaced. + // The called specialization function then creates a base frozen poison value if necessary. + // This might not be necessary in some cases, e.g. for a dword-sized value like an i32. + Value *BaseValue = nullptr; + if (!Summary.AllDwordsAreSpecialized) { + if (Indices.empty()) { + assert(RootVal->getType() == CurTy); + BaseValue = RootVal; + } else { + // We are part of a (possibly nested) aggregate. Extract our value to work on it. + BaseValue = B.CreateExtractValue(RootVal, Indices); + } + } + + // If the result of this call is going to be the final result, forward the replacement name. + // Otherwise, we will create an insertvalue instruction that will get the name. + StringRef NestedReplacementName = Indices.empty() ? 
ReplacementName : ""; + Value *Replaced = replaceDwordsInNonAggregate(CurTy, BaseValue, DwordInfos, NestedReplacementName); + if (!Replaced) + return nullptr; + + if (Indices.empty()) + return Replaced; + + // Insert the replacement into the root value + return B.CreateInsertValue(RootVal, Replaced, Indices, ReplacementName); + } + + // Final case: Aggregates + assert(CurTy->isAggregateType()); + + const StructLayout *SL = nullptr; + ArrayType *ArrTy = dyn_cast(CurTy); + StructType *STy = dyn_cast(CurTy); + unsigned NumElements = -1; + if (ArrTy) { + NumElements = ArrTy->getNumElements(); + } else { + NumElements = STy->getNumElements(); + SL = DL.getStructLayout(STy); + } + + // While working on the aggregate elements, keep track of the current replaced full aggregate value. + Value *ReplacedRootVal = RootVal; + for (unsigned ElemIdx = 0; ElemIdx < NumElements; ++ElemIdx) { + // Determine byte range covered by the element + unsigned ElemByteOffset = -1; + Type *ElemTy = nullptr; + if (ArrTy) { + ElemTy = ArrTy->getElementType(); + unsigned ElemAllocSize = DL.getTypeAllocSize(ElemTy); + ElemByteOffset = ElemIdx * ElemAllocSize; + } else { + ElemTy = STy->getElementType(ElemIdx); + ElemByteOffset = SL->getElementOffset(ElemIdx); + } + unsigned ElemByteSize = DL.getTypeStoreSize(ElemTy); + + if (ElemByteOffset % 4 != 0 || ElemByteSize % 4 != 0) { + // Give up on small/misaligned types + continue; + } + + // The element corresponds to a sub-range of CurDwordInfos. Determine it. 
+ unsigned ElemDwordBegin = ElemByteOffset / 4; + unsigned ElemNumDwords = ElemByteSize / 4; + unsigned ElemDwordEnd = ElemDwordBegin + ElemNumDwords; + assert(ElemDwordEnd <= DwordInfos.size()); + + ArrayRef ElemDwordInfos{DwordInfos.data() + ElemDwordBegin, + DwordInfos.data() + ElemDwordEnd}; + Indices.push_back(ElemIdx); + Value *Replaced = replaceDwordsImpl(ReplacedRootVal, Indices, ElemTy, ElemDwordInfos, ReplacementName); + Indices.pop_back(); + if (Replaced) { + // Replacement was successful. In the next iteration, use Replaced as base value to operate on. + ReplacedRootVal = Replaced; + } + } + + // Return nullptr if nothing changed + return ReplacedRootVal != RootVal ? ReplacedRootVal : nullptr; +} + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueSpecializationTestPass.cpp b/compilerutils/lib/ValueSpecializationTestPass.cpp new file mode 100644 index 0000000000..884ce1733a --- /dev/null +++ b/compilerutils/lib/ValueSpecializationTestPass.cpp @@ -0,0 +1,160 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#include "ValueSpecializationTestPass.h" +#include "compilerutils/CompilerUtils.h" +#include "compilerutils/ValueOriginTracking.h" +#include "compilerutils/ValueSpecialization.h" +#include +#include + +using namespace llvm; +using namespace CompilerUtils; + +namespace { + +enum TestFlags { SkipValueTrackingCheck = 0x1, AllowFailure = 0x2, Invalid = 0x4 }; + +struct ValueSpecializationInfo { + llvm::Value *Val; + llvm::SmallVector DwordInfos; + unsigned NumToBeReplacedDwords = 0; + TestFlags Flags; +}; + +// Syntax: +// call @specialize(i32 %flags, %val, i32 dw0Status, i32 dw0Constant, [i32 dw1Status, i32 dw1Constant, ...]) +ValueSpecializationInfo parseSpecializeCall(llvm::CallInst &CI) { + unsigned NumArgs = CI.arg_size(); + if (NumArgs % 2 != 0) + report_fatal_error("Unexpected num args for specialize"); + unsigned NumDwords = (NumArgs - 2) / 2; + + llvm::SmallVector DwordInfos; + unsigned NumReplacedDwords = 0; + DwordInfos.reserve(NumDwords); + for (unsigned DwordIdx = 0; DwordIdx < NumDwords; ++DwordIdx) { + llvm::Value *KindValue = CI.getArgOperand(2 + 2 * DwordIdx); + if (!isa(KindValue)) + report_fatal_error("Unexpected non-integer kind argument"); + auto KindInt = cast(KindValue)->getZExtValue(); + if (KindInt >= static_cast(ValueSpecializer::SpecializationKind::Count)) + report_fatal_error("Invalid specialization kind"); + auto Kind = 
static_cast(KindInt); + uint32_t Constant = 0; + if (Kind == ValueSpecializer::SpecializationKind::Constant) { + llvm::Value *ConstantValueValue = CI.getArgOperand(2 + 2 * DwordIdx + 1); + if (!isa(ConstantValueValue)) + report_fatal_error("Unexpected non-integer constant value argument"); + auto ConstantValueInt = cast(ConstantValueValue)->getZExtValue(); + if (ConstantValueInt >= UINT32_MAX) + report_fatal_error("Too large constant value"); + Constant = static_cast(ConstantValueInt); + } + DwordInfos.push_back({Kind, Constant}); + if (Kind != ValueSpecializer::SpecializationKind::None) + ++NumReplacedDwords; + } + Value *TestFlagsValue = CI.getArgOperand(0); + if (!isa(TestFlagsValue)) + report_fatal_error("Unexpected non-integer constant value argument"); + auto TestFlagsInt = cast(TestFlagsValue)->getZExtValue(); + if (TestFlagsInt >= static_cast(TestFlags::Invalid)) + report_fatal_error("Invalid test flags value"); + return {CI.getArgOperand(1), DwordInfos, NumReplacedDwords, static_cast(TestFlagsInt)}; +} + +} // namespace + +namespace CompilerUtils { + +llvm::PreservedAnalyses ValueSpecializationTestPass::run(llvm::Module &Module, + llvm::ModuleAnalysisManager &AnalysisManager) { + Function *SpecializeFunc = Module.getFunction("specialize"); + if (!SpecializeFunc) + return PreservedAnalyses::all(); + + SmallVector ToBeDeleted; + for (auto &F : Module) { + for (auto &BB : F) { + // Use one specialize per BB, and re-use insertion points. 
+ ValueSpecializer VS(Module); + + for (auto &Inst : BB) { + auto *CI = dyn_cast(&Inst); + if (!CI || CI->getCalledOperand() != SpecializeFunc) { + continue; + } + ToBeDeleted.push_back(CI); + + ValueSpecializationInfo VSI = parseSpecializeCall(*CI); + bool ReplaceUses = true; + bool PreserveInsertionPoint = true; + const auto [Replacement, NumReplacedDwords] = + VS.replaceDwords(VSI.Val, VSI.DwordInfos, ReplaceUses, PreserveInsertionPoint); + + if (!(VSI.Flags & TestFlags::AllowFailure) && NumReplacedDwords != VSI.NumToBeReplacedDwords) + report_fatal_error("Less than expected replacements"); + if (NumReplacedDwords != 0 && Replacement == nullptr) + report_fatal_error("Missing replacement result"); + + if (Replacement && !(VSI.Flags & TestFlags::SkipValueTrackingCheck)) { + // Run value tracking analysis on the replacement result, and check that it matches the requested replacements + ValueOriginTracker VOT{Module.getDataLayout(), 4, 256}; + const ValueTracking::ValueInfo VI = VOT.getValueInfo(Replacement); + if (VI.Slices.size() != VSI.DwordInfos.size()) + report_fatal_error("Size mismatch"); + for (unsigned DwordIdx = 0; DwordIdx < VI.Slices.size(); ++DwordIdx) { + const ValueTracking::SliceInfo &SI = VI.Slices[DwordIdx]; + const ValueSpecializer::DwordSpecializationInfo &DSI = VSI.DwordInfos[DwordIdx]; + if (DSI.Kind == ValueSpecializer::SpecializationKind::Constant) { + if (SI.Status != ValueTracking::SliceStatus::Constant || SI.ConstantValue != DSI.ConstantValue) + report_fatal_error("Failed constant specialization"); + } + if (DSI.Kind == ValueSpecializer::SpecializationKind::FrozenPoison) { + if (SI.Status != ValueTracking::SliceStatus::UndefOrPoison) + report_fatal_error("Failed frozen poison specialization"); + } + } + } + + dbgs() << "[VS]: Replaced " << NumReplacedDwords << " dwords in "; + VSI.Val->printAsOperand(dbgs()); + if (Replacement) { + dbgs() << ", replaced by "; + Replacement->printAsOperand(dbgs()); + } + dbgs() << "\n"; + } + } + } + + for 
(auto *CI : ToBeDeleted) + CI->eraseFromParent(); + + return PreservedAnalyses::none(); +} + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueSpecializationTestPass.h b/compilerutils/lib/ValueSpecializationTestPass.h new file mode 100644 index 0000000000..20465412ef --- /dev/null +++ b/compilerutils/lib/ValueSpecializationTestPass.h @@ -0,0 +1,42 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace CompilerUtils { + +// Helper pass to enable lit tests of ValueSpecializer. 
+// Calls to a function named "specialize" trigger value specialization. +class ValueSpecializationTestPass : public llvm::PassInfoMixin { +public: + llvm::PreservedAnalyses run(llvm::Module &Module, llvm::ModuleAnalysisManager &AnalysisManager); + + static llvm::StringRef name() { return "Test ValueSpecialization"; } +}; + +} // namespace CompilerUtils diff --git a/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil b/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil index 3eae8bf5ab..1bf2fa9f61 100644 --- a/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil +++ b/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil @@ -94,6 +94,19 @@ define i1 @test_struct_gep(ptr %arg, i32 %index) { ret i1 %res } +define i1 @test_shufflevector(<2 x i1> %args.0, <2 x i1> %args.1) { +; CHECK-LABEL: define {{[^@]+}}@test_shufflevector +; CHECK-SAME: (<2 x i32> [[ARGS_0:%.*]], <2 x i32> [[ARGS_1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[ARGS_0]], <2 x i32> [[ARGS_1]], <1 x i32> +; CHECK-NEXT: [[RES2:%.*]] = extractelement <1 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[RES2]] to i1 +; CHECK-NEXT: ret i1 [[TMP1]] +; + %tmp = shufflevector <2 x i1> %args.0, <2 x i1> %args.1, <1 x i32> + %res = extractelement <1 x i1> %tmp, i32 0 + ret i1 %res +} + define void @test_pointee_metadata(<7 x i1>, ptr) !types !1 { ; CHECK-LABEL: define {{[^@]+}}@test_pointee_metadata ; CHECK-SAME: (<7 x i32> [[TMP0:%.*]], ptr [[TMP1:%.*]]) !types [[META2:![0-9]+]] { diff --git a/compilerutils/test/value-origin-tracking/assumptions.ll b/compilerutils/test/value-origin-tracking/assumptions.ll new file mode 100644 index 0000000000..874630a20c --- /dev/null +++ b/compilerutils/test/value-origin-tracking/assumptions.ll @@ -0,0 +1,72 @@ +; RUN: opt -passes="value-origin-tracking-test" -S %s | FileCheck %s + +declare void @analyze(...) + +; Intrinsic to declare value origin assumptions. 
+; Syntax: +; call void @assume(%val, [constantOrDynamicValue], i32 dynamicValueByteOffset, [...]) +declare void @assume(...) + +declare i32 @opaque() + +define void @testSimpleAssumptions(i32 %arg) { +; CHECK-LABEL: testSimpleAssumptions + + %opaque = call i32 @opaque() +; CHECK: %opaque = call i32 @opaque()): Dynamic: %opaque = {{.*}} (offset 0) + call void @analyze(i32 %opaque) + + %opaque.with.constant.assumption = call i32 @opaque() + call void @assume(i32 %opaque.with.constant.assumption, i32 u0xbeef, i32 0) +; CHECK: %opaque.with.constant.assumption = call i32 @opaque()): Constant: 0xbeef + call void @analyze(i32 %opaque.with.constant.assumption) + + %opaque.with.dynamic.assumption = call i32 @opaque() + call void @assume(i32 %opaque.with.dynamic.assumption, i32 %arg, i32 0) +; CHECK: %opaque.with.dynamic.assumption = call i32 @opaque()): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(i32 %opaque.with.dynamic.assumption) + + %opaque.with.self.assumption = call i32 @opaque() + call void @assume(i32 %opaque.with.self.assumption, i32 %opaque.with.self.assumption, i32 0) +; CHECK: %opaque.with.self.assumption = call i32 @opaque()): Dynamic: %opaque.with.self.assumption {{.*}} (offset 0) + call void @analyze(i32 %opaque.with.self.assumption) + + %opaque.with.nested.assumption = call i32 @opaque() + call void @assume(i32 %opaque.with.nested.assumption, i32 %opaque.with.dynamic.assumption, i32 0) +; CHECK: %opaque.with.nested.assumption = call i32 @opaque()): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(i32 %opaque.with.nested.assumption) + + %derived = bitcast i32 %opaque.with.nested.assumption to float +; CHECK: %derived = bitcast i32 %opaque.with.nested.assumption to float): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(float %derived) + + ; Test that we currently don't merge assumptions with our own analysis on the same value: + ; A trivial assumption can lead to worse results. 
+ %trivial = bitcast i32 0 to float + call void @assume(float %trivial, float %trivial, i32 0) +; CHECK: %trivial = bitcast i32 0 to float): Dynamic: %trivial + call void @analyze(float %trivial) + + ret void +} + +declare [3 x i32] @permute([3 x i32] %arr) + +; Test assumptions on larger types with nontrivial offsets +; Add assumptions assuming that @permute permutes the input array. +; After three rounds, we should get back the original one. +define void @testAssumptionsWithOffsets([3 x i32] %arg) { +; CHECK-LABEL: testAssumptionsWithOffsets + %permuted.0 = call [3 x i32] @permute([3 x i32] %arg) + call void @assume([3 x i32] %permuted.0, [3 x i32] %arg, i32 4, [3 x i32] %arg, i32 8, [3 x i32] %arg, i32 0) + %permuted.1 = call [3 x i32] @permute([3 x i32] %permuted.0) + call void @assume([3 x i32] %permuted.1, [3 x i32] %permuted.0, i32 4, [3 x i32] %permuted.0, i32 8, [3 x i32] %permuted.0, i32 0) +; CHECK: %permuted.1 = {{.*}}: Dynamic (argument): [3 x i32] %arg (offset 8); Dynamic (argument): [3 x i32] %arg (offset 0); Dynamic (argument): [3 x i32] %arg (offset 4) + call void @analyze([3 x i32] %permuted.1) + %permuted.final = call [3 x i32] @permute([3 x i32] %permuted.1) + call void @assume([3 x i32] %permuted.final, [3 x i32] %permuted.1, i32 4, [3 x i32] %permuted.1, i32 8, [3 x i32] %permuted.1, i32 0) +; CHECK: %permuted.final = {{.*}}: Dynamic (argument): [3 x i32] %arg (offset 0); Dynamic (argument): [3 x i32] %arg (offset 4); Dynamic (argument): [3 x i32] %arg (offset 8) + call void @analyze([3 x i32] %permuted.final) + + ret void +} diff --git a/compilerutils/test/value-origin-tracking/basic-tests.ll b/compilerutils/test/value-origin-tracking/basic-tests.ll new file mode 100644 index 0000000000..d722c02f77 --- /dev/null +++ b/compilerutils/test/value-origin-tracking/basic-tests.ll @@ -0,0 +1,317 @@ +; RUN: opt -passes="value-origin-tracking-test" -S %s | FileCheck %s + +declare void @analyze(...) 
+ +define void @testConstantInt() { +; CHECK-LABEL: testConstantInt + +; CHECK: (i1 true): Constant: 0x1 + call void @analyze(i1 true) + +; CHECK: (i8 16): Constant: 0x10 + call void @analyze(i8 16) + +; CHECK: (i16 17): Constant: 0x11 + call void @analyze(i16 17) + +; CHECK: (i32 64): Constant: 0x40 + call void @analyze(i32 64) + +; CHECK: (i64 4294967311): Constant: 0xf; Constant: 0x1 + call void @analyze(i64 u0x10000000f) + + ret void +} + +define void @testConstantFloat() { +; CHECK-LABEL: testConstantFloat + +; CHECK: (half 0xH1234): Constant: 0x1234 + call void @analyze(half 0xH1234) + +; CHECK: (float 1.250000e-01): Constant: 0x3e000000 + call void @analyze(float 1.250000e-01) + +; CHECK: (double 0x123456789ABCDEF): Constant: 0x89abcdef; Constant: 0x1234567 + call void @analyze(double 0x0123456789abcdef) + +; Check that float "zero" is not incorrectly handled as "null" +; CHECK: (float -0.000000e+00): Constant: 0x80000000 + call void @analyze(float -0.0) + +; CHECK: (float 1.250000e-01): Constant: 0x3e000000 + call void @analyze(float bitcast (i32 u0x3e000000 to float)) + ret void +} + +define void @testConstantVector() { +; CHECK-LABEL: testConstantVector + +; CHECK: (<2 x i32> zeroinitializer): Constant: 0x0; Constant: 0x0 + call void @analyze(<2 x i32> zeroinitializer) + +; CHECK: (<9 x i8> zeroinitializer): Constant: 0x0; Constant: 0x0; Constant: 0x0 + call void @analyze(<9 x i8> zeroinitializer) + +; CHECK: (<1 x i32> ): Constant: 0xdeadbeef + call void @analyze(<1 x i32> ) + +; CHECK: (<4 x i8> ): Constant: 0x4030201 + call void @analyze(<4 x i8> ) + +; CHECK: (<1 x float> ): Constant: 0x3e000000 + call void @analyze(<1 x float> ) + +; computeKnownBits only supports integer vectors, and our +; handling doesn't support smaller-than-slice element types. 
+; CHECK: (<1 x half> ): Dynamic + call void @analyze(<1 x half> ) + +; CHECK: (<4 x float> ): Constant: 0x0; Constant: 0x43800000; Constant: 0x0; UndefOrPoison + call void @analyze(<4 x float> ) + + ret void +} + +define void @testConstantArray() { +; CHECK-LABEL: testConstantArray + +; CHECK: ([2 x i32] zeroinitializer): Constant: 0x0; Constant: 0x0 + call void @analyze([2 x i32] zeroinitializer) + +; CHECK: ([9 x i8] zeroinitializer): Constant: 0x0; Constant: 0x0; Constant: 0x0 + call void @analyze([9 x i8] zeroinitializer) + +; CHECK: ([1 x i32] [i32 -559038737]): Constant: 0xdeadbeef + call void @analyze([1 x i32] [i32 u0xdeadbeef]) + +; In contrast to vectors, we can't detect constant arrays of small types. +; This is because llvm computeKnownBits supports vectors but not arrays, +; and our handling of constant arrays/vectors doesn't support element types +; smaller than slices. +; CHECK: ([4 x i8] c"\01\02\03\04"): Dynamic + call void @analyze([4 x i8] [i8 1, i8 2, i8 3, i8 4]) + +; CHECK: ([1 x float] [float 1.250000e-01]): Constant: 0x3e000000 + call void @analyze([1 x float] [float 1.250000e-01]) + +; CHECK: ([4 x float] [float 0.000000e+00, float 2.560000e+02, float 0.000000e+00, float undef]): Constant: 0x0; Constant: 0x43800000; Constant: 0x0; UndefOrPoison + call void @analyze([4 x float] [float 0.000000e+00, float bitcast (i32 u0x43800000 to float), float 0.000000e+00, float undef]) + + ret void +} + +%somestruct = type { i32, i8, half } +define void @testConstantStruct() { +; CHECK-LABEL: testConstantStruct +; Only support zeroinitializer for now + +; CHECK: (%somestruct zeroinitializer): Constant: 0x0; Constant: 0x0 + call void @analyze(%somestruct zeroinitializer) + +; CHECK: (%somestruct { i32 1, i8 1, half 0xH0000 }): Dynamic: {{.*}} (offset 0); Dynamic: {{.*}} (offset 4) + call void @analyze(%somestruct { i32 1, i8 1, half 0xH0 }) + + ret void +} + +define void @testDynamic(i32 %arg) { +; CHECK-LABEL: testDynamic +; CHECK: (i32 %arg): Dynamic 
(argument): i32 %arg (offset 0) + call void @analyze(i32 %arg) + ret void +} + +define void @testPoison() { +; CHECK-LABEL: testPoison +; CHECK: (i1 poison): UndefOrPoison + call void @analyze(i1 poison) +; CHECK: (i32 poison): UndefOrPoison + call void @analyze(i32 poison) +; CHECK: (double poison): UndefOrPoison; UndefOrPoison + call void @analyze(double poison) + + %freezePoison = freeze i32 poison +; CHECK: ( %freezePoison = {{.*}}): UndefOrPoison + call void @analyze(i32 %freezePoison) + + %freezeNonPoison = freeze i32 5 +; CHECK: ( %freezeNonPoison = {{.*}}): Constant: 0x5 + call void @analyze(i32 %freezeNonPoison) + ret void +} + +define void @testArray(i32 %arg) { +; CHECK-LABEL: testArray + %arr.1 = insertvalue [3 x i32] poison, i32 100, 0 + %arr.2 = insertvalue [3 x i32] %arr.1, i32 %arg, 1 + %extract.0 = extractvalue [3 x i32] %arr.2, 0 + %extract.1 = extractvalue [3 x i32] %arr.2, 1 + %extract.2 = extractvalue [3 x i32] %arr.2, 2 +; CHECK: ( %extract.0 = extractvalue [3 x i32] %arr.2, 0): Constant: 0x64 + call void @analyze(i32 %extract.0) +; CHECK: ( %extract.1 = extractvalue [3 x i32] %arr.2, 1): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(i32 %extract.1) +; CHECK: ( %extract.2 = extractvalue [3 x i32] %arr.2, 2): UndefOrPoison + call void @analyze(i32 %extract.2) + ret void +} + +define void @testVector(i32 %arg) { +; CHECK-LABEL: testVector + %vec.1 = insertelement <3 x i32> poison, i32 100, i32 0 + %vec.2 = insertelement <3 x i32> %vec.1, i32 %arg, i32 1 + %extract.0 = extractelement <3 x i32> %vec.2, i32 0 + %extract.1 = extractelement <3 x i32> %vec.2, i32 1 + %extract.2 = extractelement <3 x i32> %vec.2, i32 2 + %extract.dyn = extractelement <3 x i32> %vec.2, i32 %arg +; CHECK: ( %extract.0 = extractelement <3 x i32> %vec.2, i32 0): Constant: 0x64 + call void @analyze(i32 %extract.0) +; CHECK: ( %extract.1 = extractelement <3 x i32> %vec.2, i32 1): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(i32 %extract.1) +; 
CHECK: ( %extract.2 = extractelement <3 x i32> %vec.2, i32 2): UndefOrPoison + call void @analyze(i32 %extract.2) +; CHECK: ( %extract.dyn = extractelement <3 x i32> %vec.2, i32 %arg): Dynamic: %extract.dyn = extractelement <3 x i32> %vec.2, i32 %arg (offset 0) + call void @analyze(i32 %extract.dyn) + + ; Test that inserting an i1 into the middle of a dword doesn't accidentally overwrite high bits + %insert.i1 = insertelement <32 x i1> zeroinitializer, i1 1, i32 16 +; CHECK: ( %insert.i1 = {{.*}}): Dynamic + call void @analyze(<32 x i1> %insert.i1) + + ret void +} + +define void @testBitcast(i32 %arg) { +; CHECK-LABEL: testBitcast + %bitcast = bitcast i32 %arg to float +; CHECK: ( %bitcast = bitcast i32 %arg to float): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(float %bitcast) + ret void +} + +define void @testSelect(i32 %arg1, i1 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, i1 %arg6, i1 %arg7) { +; CHECK-LABEL: testSelect + %sel.1 = select i1 %arg2, i32 %arg1, i32 -1 +; CHECK: ( %sel.1 = select i1 %arg2, i32 %arg1, i32 -1): (Constant: 0xffffffff | Dynamic (argument): i32 %arg1 (offset 0)) + call void @analyze(i32 %sel.1) + +; Consistent constant, in different order: + %sel.2 = select i1 %arg3, i32 -1, i32 %sel.1 +; CHECK: ( %sel.2 = select i1 %arg3, i32 -1, i32 %sel.1): (Constant: 0xffffffff | Dynamic (argument): i32 %arg1 (offset 0)) + call void @analyze(i32 %sel.2) + +; Inconsistent constants mean we don't know anything: + %sel.3 = select i1 %arg5, i32 %sel.2, i32 0 +; CHECK: ( %sel.3 = select i1 %arg5, i32 %sel.2, i32 0): Dynamic: %sel.3 = select i1 %arg5, i32 %sel.2, i32 0 (offset 0) + call void @analyze(i32 %sel.3) + +; Consistent dynamic value: + %arg1.bc.float = bitcast i32 %arg1 to float + %arg1.bc.i32 = bitcast float %arg1.bc.float to i32 + %sel.4 = select i1 %arg6, i32 %sel.2, i32 %arg1.bc.i32 +; CHECK: ( %sel.4 = select i1 %arg6, i32 %sel.2, i32 %arg1.bc.i32): (Constant: 0xffffffff | Dynamic (argument): i32 %arg1 (offset 0)) + call void 
@analyze(i32 %sel.4) + +; Inconsistent dynamic value means we don't know anything + %sel.5 = select i1 %arg6, i32 %sel.2, i32 %arg4 +; CHECK: ( %sel.5 = select i1 %arg6, i32 %sel.2, i32 %arg4): Dynamic: %sel.5 = select i1 %arg6, i32 %sel.2, i32 %arg4 (offset 0) + call void @analyze(i32 %sel.5) + +; Add in poison: + %sel.6 = select i1 %arg7, i32 %sel.2, i32 poison +; CHECK: ( %sel.6 = select i1 %arg7, i32 %sel.2, i32 poison): (UndefOrPoison | Constant: 0xffffffff | Dynamic (argument): i32 %arg1 (offset 0)) + call void @analyze(i32 %sel.6) + + ret void +} + +define void @testPhi(i32 %arg1, i1 %arg2, i1 %arg3, [5 x i32] %arg4) { +; CHECK-LABEL: testPhi +entry: + %empty = phi i32 +; CHECK: ( %empty = phi i32 ): Dynamic: %empty = phi i32 (offset 0) + call void @analyze(i32 %empty) + + br i1 %arg2, label %bb1, label %bb2 +bb1: + %phi.arg = phi i32 [ %arg1, %entry ] +; CHECK: ( %phi.arg = phi i32 [ %arg1, %entry ]): Dynamic (argument): i32 %arg1 (offset 0) + call void @analyze(i32 %phi.arg) + br label %bb2 +bb2: + %phi.argOrConst = phi i32 [ %arg1, %entry ], [ 1, %bb1] +; CHECK: ( %phi.argOrConst = phi i32 [ %arg1, %entry ], [ 1, %bb1 ]): (Constant: 0x1 | Dynamic (argument): i32 %arg1 (offset 0)) + call void @analyze(i32 %phi.argOrConst) + br label %loop.entry +loop.entry: + %phi.loop.constant = phi i32 [ 1, %bb2 ], [ 1, %loop.entry] + %phi.loop.propagate = phi i32 [ 1, %bb2 ], [ %phi.loop.propagate, %loop.entry] + +; CHECK: ( %phi.loop.constant = phi i32 [ 1, %bb2 ], [ 1, %loop.entry ]): Constant: 0x1 + call void @analyze(i32 %phi.loop.constant) +; %phi.loop.propagate is always constant, but figuring this out requires propagating +; multiple times through the loop until a stable state is reached, which we don't do: +; CHECK: ( %phi.loop.propagate = {{.*}}: Dynamic: %phi.loop.propagate = phi + call void @analyze(i32 %phi.loop.propagate) + br i1 %arg3, label %loop.entry, label %bb.startmany +bb.startmany: + switch i32 %arg1, label %exit [ i32 0, label %bb.many.0 + i32 1, 
label %bb.many.1
+                                  i32 2, label %bb.many.2
+                                  i32 3, label %bb.many.3
+                                  i32 4, label %bb.many.4 ]
+bb.many.0:
+  %arr.0 = insertvalue [5 x i32] %arg4, i32 0, 0
+  br label %bb.many.exit
+bb.many.1:
+  %arr.1 = insertvalue [5 x i32] %arg4, i32 1, 1
+  br label %bb.many.exit
+bb.many.2:
+  %arr.2 = insertvalue [5 x i32] %arg4, i32 2, 2
+  br label %bb.many.exit
+bb.many.3:
+  %arr.3 = insertvalue [5 x i32] %arg4, i32 3, 3
+  br label %bb.many.exit
+bb.many.4:
+  %arr.4 = insertvalue [5 x i32] %arg4, i32 4, 4
+  br label %bb.many.exit
+bb.many.exit:
+  %arr.phi = phi [5 x i32] [ %arr.0, %bb.many.0 ], [ %arr.1, %bb.many.1 ], [ %arr.2, %bb.many.2 ], [ %arr.3, %bb.many.3 ], [ %arr.4, %bb.many.4 ]
+; CHECK: ( %arr.phi = phi {{.*}}): (Constant: 0x0 | Dynamic (argument): [5 x i32] %arg4 (offset 0)); (Constant: 0x1 | Dynamic (argument): [5 x i32] %arg4 (offset 4)); (Constant: 0x2 | Dynamic (argument): [5 x i32] %arg4 (offset 8)); (Constant: 0x3 | Dynamic (argument): [5 x i32] %arg4 (offset 12)); (Constant: 0x4 | Dynamic (argument): [5 x i32] %arg4 (offset 16))
+  call void @analyze([5 x i32] %arr.phi)
+  br label %exit
+exit:
+  ret void
+}
+
+; This is a regression test against an earlier problem with the order in which we analyze
+; values. We need to process operands before processing an instruction itself, i.e. in a topological order.
+; If there are cycles, we need to give up on some dependencies (supposedly only back dependencies to phi nodes).
+define void @testProcessOrder(i32 %arg1, i1 %cond) {
+; CHECK-LABEL: testProcessOrder
+; This fails with DFS order: We push a and b to the stack when checking c.
+; Then we process b, and see that a is already on the stack, so we don't push a to the stack again.
+; After having processed arg1, b is on top of the stack, so we pop it and analyze it, but a is still unknown. 
+  %a = select i1 %cond, i32 %arg1, i32 7
+  %b = select i1 %cond, i32 %a, i32 7
+  %c = select i1 %cond, i32 %a, i32 %b
+; CHECK: ( %c = select {{.*}}): (Constant: 0x7 | Dynamic (argument): i32 %arg1 (offset 0))
+  call void @analyze(i32 %c)
+  ret void
+}
+
+; For unsupported instructions (e.g. add), we try to use computeKnownBits as a last fallback.
+; This allows us to detect some simple cases as well.
+define void @testDynamicComputeKnownBits(i32 %arg1, i1 %cond) {
+; CHECK-LABEL: testDynamicComputeKnownBits
+  %add = add i32 1, 2
+; CHECK: ( %add = add {{.*}}): Constant: 0x3
+  call void @analyze(i32 %add)
+
+; computeKnownBits only supports integers:
+; CHECK: ( %fadd = fadd {{.*}}): Dynamic
+  %fadd = fadd float 1.0, 2.0
+  call void @analyze(float %fadd)
+  ret void
+}
diff --git a/compilerutils/test/value-origin-tracking/max-value-size.ll b/compilerutils/test/value-origin-tracking/max-value-size.ll
new file mode 100644
index 0000000000..9170031314
--- /dev/null
+++ b/compilerutils/test/value-origin-tracking/max-value-size.ll
@@ -0,0 +1,14 @@
+; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-max-bytes-per-value=4 | FileCheck %s --check-prefix=CHECK-SMALL
+; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-max-bytes-per-value=1024 | FileCheck %s --check-prefix=CHECK-HUGE
+
+declare void @analyze(...) 
+ +define void @test() { +; CHECK-LABEL: test + %arr = insertvalue [256 x i32] poison, i32 7, 255 + %val = extractvalue [256 x i32] %arr, 255 +; CHECK-SMALL: ( %val = extractvalue [256 x i32] %arr, 255): Dynamic: %val = extractvalue [256 x i32] %arr, 255 (offset 0) +; CHECK-HUGE: ( %val = extractvalue [256 x i32] %arr, 255): Constant: 0x7 + call void @analyze(i32 %val) + ret void +} diff --git a/compilerutils/test/value-origin-tracking/slice-sizes.ll b/compilerutils/test/value-origin-tracking/slice-sizes.ll new file mode 100644 index 0000000000..147a6ce1bc --- /dev/null +++ b/compilerutils/test/value-origin-tracking/slice-sizes.ll @@ -0,0 +1,73 @@ +; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-bytes-per-slice=1 | FileCheck %s --check-prefix=CHECK1 +; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-bytes-per-slice=4 | FileCheck %s --check-prefix=CHECK4 + +declare void @analyze(...) + +define void @testConstant() { +; CHECK-LABEL: testConstant +; CHECK1: (i32 -5601263): Constant: 0x11; Constant: 0x88; Constant: 0xaa; Constant: 0xff +; CHECK4: (i32 -5601263): Constant: 0xffaa8811 + call void @analyze(i32 u0xffaa8811) + ret void +} + +define void @testArray(i8 %arg) { +; CHECK-LABEL: testArray + %arr.1 = insertvalue [3 x i8] poison, i8 u0xff, 0 + %arr.2 = insertvalue [3 x i8] %arr.1, i8 poison, 1 + %arr.3 = insertvalue [3 x i8] %arr.2, i8 %arg, 2 +; CHECK1: ( %arr.3 = {{.*}}: Constant: 0xff; UndefOrPoison; Dynamic (argument): i8 %arg (offset 0) +; CHECK4: ( %arr.3 = {{.*}}: Dynamic: {{.*}} (offset 0) + call void @analyze([3 x i8] %arr.3) + ret void +} + +; Check that inserting a value into a range that is not slice-aligned invalidates +; the affected slices, but preserves the other ones. +; We insert the i16 at index 3 into this packed struct, which covers bytes 7 and 8. +; This touches two dwords, so with dword-sized slices the two middle dwords are dynamic. 
+; Byte-sized slices however nicely deal with it. +%packed.struct = type <{i32, i16, i8, i16, i16, i8, i32 }> +; Indices: 0 1 2 3 4 5 6 +; Byte ranges: 0..3 4..5 6..6 7..8 9..10 11..11 12..15 +; interesting value: ----^^^^ +define void @testMisalignedInsertExtract() { +; CHECK-LABEL: testMisalignedInsertExtract +; CHECK1: ( %inserted.3 = {{.*}}): Constant: 0xff; Constant: 0xff; Constant: 0xff; Constant: 0xff; Constant: 0x0; Constant: 0x0; Constant: 0x1; Constant: 0xff; Constant: 0xff +; CHECK1-SAME: Constant: 0x0; Constant: 0x0; Constant: 0x0; Constant: 0x0; Constant: 0x0; Constant: 0x0; Constant: 0x0 +; CHECK4: ( %inserted.3 = {{.*}}): Constant: 0xffffffff; Dynamic: {{.*}}; Dynamic: {{.*}}; Constant: 0x0 + %inserted.1 = insertvalue %packed.struct zeroinitializer, i32 -1, 0 + %inserted.2 = insertvalue %packed.struct %inserted.1, i8 1, 2 + %inserted.3 = insertvalue %packed.struct %inserted.2, i16 -1, 3 + call void @analyze(%packed.struct %inserted.3) + +; CHECK1: ( %extracted = {{.*}}): Constant: 0x0 +; CHECK4: ( %extracted = {{.*}}): Dynamic + %extracted = extractvalue %packed.struct zeroinitializer, 3 + call void @analyze(i16 %extracted) + + ret void +} + +; Test that inserting/extracting a value that is slice-aligned but smaller than a slice works correctly +; We insert/extract the i16 at index 1 in this struct: +%packed.struct.1 = type <{i32, i16, i16, i32 }> +; interesting value: ----^^^^ +define void @testAlignedSubSliceInsertExtract() { +; CHECK-LABEL: testAlignedSubSliceInsertExtract + %inserted.1 = insertvalue %packed.struct.1 zeroinitializer, i32 -1, 0 + %extracted.1 = extractvalue %packed.struct.1 %inserted.1, 1 + %inserted.2 = insertvalue %packed.struct.1 %inserted.1, i16 1, 1 + %extracted.2 = extractvalue %packed.struct.1 %inserted.2, 1 + +; CHECK1: ( %extracted.1 = {{.*}}): Constant: 0x0; Constant: 0x0 +; CHECK4: ( %extracted.1 = {{.*}}): Constant: 0x0 + call void @analyze(i16 %extracted.1) + +; CHECK1: ( %extracted.2 = {{.*}}): Constant: 0x1; 
Constant: 0x0 +; We don't support partial insertions, so treat this conservatively: +; CHECK4: ( %extracted.2 = {{.*}}): Dynamic + call void @analyze(i16 %extracted.2) + + ret void +} diff --git a/compilerutils/test/value-origin-tracking/vector.ll b/compilerutils/test/value-origin-tracking/vector.ll new file mode 100644 index 0000000000..b7aa3bb5cc --- /dev/null +++ b/compilerutils/test/value-origin-tracking/vector.ll @@ -0,0 +1,68 @@ +; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-bytes-per-slice=1 | FileCheck %s --check-prefix=CHECK1 +; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-bytes-per-slice=4 | FileCheck %s --check-prefix=CHECK4 +; +; Test vector ops on types that aren't byte-aligned (i1) and overaligned (i16) +target datalayout = "i16:32" + +declare void @analyze(...) + +define void @testi1(i32 %arg) { +; CHECK-LABEL: testi1 + %vec.0 = insertelement <16 x i1> poison, i1 1, i32 0 + %vec.1 = insertelement <16 x i1> %vec.0, i1 0, i32 1 + %vec.2 = insertelement <16 x i1> %vec.1, i1 1, i32 2 + %vec.3 = insertelement <16 x i1> %vec.2, i1 0, i32 3 + %vec.4 = insertelement <16 x i1> %vec.3, i1 1, i32 4 + %vec.5 = insertelement <16 x i1> %vec.4, i1 0, i32 5 + %vec.6 = insertelement <16 x i1> %vec.5, i1 1, i32 6 + %vec.7 = insertelement <16 x i1> %vec.6, i1 0, i32 7 + %vec.8 = insertelement <16 x i1> %vec.7, i1 1, i32 8 + %vec.9 = insertelement <16 x i1> %vec.8, i1 0, i32 9 + %vec.10 = insertelement <16 x i1> %vec.9, i1 1, i32 10 + call void @analyze(<16 x i1> %vec.10) +; CHECK1: ( %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10): Dynamic: %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10 (offset 0); Dynamic: %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10 (offset 1) +; CHECK4: ( %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10): Dynamic: %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10 (offset 0) + ret void +} + +define void @testi1InsertExtract() { 
+; CHECK-LABEL: testi1InsertExtract
+; We don't support sub-byte inserts/extractions yet, as demonstrated in this test
+  %vec.1 = bitcast i32 -1 to <32 x i1>
+  %extract.1 = extractelement <32 x i1> %vec.1, i32 0
+; CHECK: ( %extract.1 = extractelement <32 x i1> %vec.1, i32 0): Dynamic: {{.*}} (offset 0)
+  call void @analyze(i1 %extract.1)
+  %vec.2 = bitcast i32 0 to <32 x i1>
+  %vec.3 = insertelement <32 x i1> %vec.2, i1 1, i32 8
+  call void @analyze(<32 x i1> %vec.3)
+; CHECK: ( %vec.3 = insertelement <32 x i1> %vec.2, i1 true, i32 8): Dynamic: {{.*}} (offset 0); Dynamic: {{.*}} (offset 1); Dynamic: {{.*}} (offset 2); Dynamic: {{.*}} (offset 3)
+  ret void
+}
+
+define void @testi16(i32 %arg) {
+; CHECK-LABEL: testi16
+  %vec.1 = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vec.2 = insertelement <4 x i16> %vec.1, i16 0, i32 1
+; CHECK1: %vec.2 = {{.*}}: Constant: 0xff; Constant: 0xff; Constant: 0x0; Constant: 0x0; UndefOrPoison; UndefOrPoison; UndefOrPoison; UndefOrPoison
+; Sub-slice extract/insert isn't supported:
+; CHECK4: %vec.2 = {{.*}}: Dynamic
+  call void @analyze(<4 x i16> %vec.2)
+; CHECK1: %extract.1 = {{.*}}): Constant: 0xff; Constant: 0xff
+; CHECK4: %extract.1 = {{.*}}: Dynamic
+  %extract.1 = extractelement <4 x i16> %vec.2, i32 0
+  call void @analyze(i16 %extract.1)
+; CHECK1: %extract.2 = {{.*}}): Constant: 0x0; Constant: 0x0
+; CHECK4: %extract.2 = {{.*}}: Dynamic
+  %extract.2 = extractelement <4 x i16> %vec.2, i32 1
+  call void @analyze(i16 %extract.2)
+  ret void
+}
+
+; Regression test for computeKnownBits handling of vectors
+define void @testShuffleVector(i32 %arg) {
+; CHECK-LABEL: testShuffleVector
+  %vec = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> 
+; CHECK: %vec = shufflevector {{.*}}: Dynamic
+  call void @analyze(<2 x i32> %vec)
+  ret void
+}
diff --git a/compilerutils/test/value-specialization/specialization.ll b/compilerutils/test/value-specialization/specialization.ll
new file mode 100644
index 
0000000000..87ca44f057 --- /dev/null +++ b/compilerutils/test/value-specialization/specialization.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; +; RUN: opt --verify-each -passes='value-specialization-test' -S %s | FileCheck %s +; +; Intentionally align i64 to 64 bits so we can test specializations within types with padding, +; and align float to 16 bits to test misaligned dword-sized scalars. +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:16-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +; Syntax: +; call void @specialize(i32 %flags, %val, i32 %dw0Kind, i32 %dw0Constant, [...]) +; flag bits: +; skip value tracking check: 0x1 +; allow replacement failures: 0x2 (if not set, fail if any dword replacement on this value fails) +; Kind values: +; None: 0 +; Constant: 1 +; FrozenPoison: 2 +declare void @specialize(...) +declare void @use(...) + +define void @SimpleScalars(i32 %arg0, i32 %arg1, i32 %arg2, float %arg3) { +; CHECK-LABEL: define void @SimpleScalars( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], float [[ARG3:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 poison +; CHECK-NEXT: call void (...) @use(i32 [[ARG0]], i32 42, i32 [[TMP1]], float 0x3744E40000000000) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, i32 %arg0, i32 0, i32 poison) + call void @specialize(i32 0, i32 %arg1, i32 1, i32 42) + call void @specialize(i32 0, i32 %arg2, i32 2, i32 poison) + call void @specialize(i32 0, float %arg3, i32 1, i32 1337) + call void (...) @use(i32 %arg0, i32 %arg1, i32 %arg2, float %arg3) + ret void +} + +; I64 specialization is "special", as we potentially specialize low and high dwords separately. 
+; Test all non-trivial combinations: +; (low dword) (high dword) +; * arg0: None + Constant +; * arg1: None + FrozenPoison +; * arg2: Constant + None +; * arg3: Constant + FrozenPoison +; * arg4: FrozenPoison + None +; * arg5: FrozenPoison + Constant +; as well as uniform ones: +; * arg6: Constant + Constant +; * arg7: FrozenPoison + FrozenPoison +; +; Don't check with value tracking (flags=1) as it does not support the used bitwise operations. +define void @I64s(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7) { +; CHECK-LABEL: define void @I64s( +; CHECK-SAME: i64 [[ARG0:%.*]], i64 [[ARG1:%.*]], i64 [[ARG2:%.*]], i64 [[ARG3:%.*]], i64 [[ARG4:%.*]], i64 [[ARG5:%.*]], i64 [[ARG6:%.*]], i64 [[ARG7:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[ARG0]], 4294967295 +; CHECK-NEXT: [[ARG0_SPECIALIZED:%.*]] = or i64 [[TMP1]], 4294967296 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[ARG1]], 4294967295 +; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 poison +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -4294967296 +; CHECK-NEXT: [[ARG1_SPECIALIZED:%.*]] = or i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[ARG2]], -4294967296 +; CHECK-NEXT: [[ARG2_SPECIALIZED:%.*]] = or i64 2, [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = freeze i64 poison +; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], -4294967296 +; CHECK-NEXT: [[ARG3_SPECIALIZED:%.*]] = or i64 3, [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = freeze i64 poison +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 4294967295 +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[ARG4]], -4294967296 +; CHECK-NEXT: [[ARG4_SPECIALIZED:%.*]] = or i64 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = freeze i64 poison +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], 4294967295 +; CHECK-NEXT: [[ARG5_SPECIALIZED:%.*]] = or i64 [[TMP12]], 17179869184 +; CHECK-NEXT: [[TMP13:%.*]] = freeze i64 poison +; CHECK-NEXT: call void (...) 
@use(i64 [[ARG0_SPECIALIZED]], i64 [[ARG1_SPECIALIZED]], i64 [[ARG2_SPECIALIZED]], i64 [[ARG3_SPECIALIZED]], i64 [[ARG4_SPECIALIZED]], i64 [[ARG5_SPECIALIZED]], i64 25769803781, i64 [[TMP13]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 1, i64 %arg0, i32 0, i32 poison, i32 1, i32 1) + call void @specialize(i32 1, i64 %arg1, i32 0, i32 poison, i32 2, i32 poison) + call void @specialize(i32 1, i64 %arg2, i32 1, i32 2, i32 0, i32 poison) + call void @specialize(i32 1, i64 %arg3, i32 1, i32 3, i32 2, i32 poison) + call void @specialize(i32 1, i64 %arg4, i32 2, i32 poison, i32 0, i32 poison) + call void @specialize(i32 1, i64 %arg5, i32 2, i32 poison, i32 1, i32 4) + call void @specialize(i32 1, i64 %arg6, i32 1, i32 5, i32 1, i32 6) + call void @specialize(i32 1, i64 %arg7, i32 2, i32 poison, i32 2, i32 poison) + call void (...) @use(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7) + ret void +} + +define void @Double(double %arg) { +; CHECK-LABEL: define void @Double( +; CHECK-SAME: double [[ARG:%.*]]) { +; CHECK-NEXT: call void (...) @use(double 2.075080e-322) +; CHECK-NEXT: ret void +; + call void @specialize(i32 1, double %arg, i32 1, i32 42, i32 1, i32 0) + call void (...) @use(double %arg) + ret void +} + +; ptr is 64 bits wide, ptr addrspace (20) is 32 bits wide +define void @Pointers(ptr %arg0, ptr addrspace(20) %arg1) { +; CHECK-LABEL: define void @Pointers( +; CHECK-SAME: ptr [[ARG0:%.*]], ptr addrspace(20) [[ARG1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARG0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296 +; CHECK-NEXT: [[TMP3:%.*]] = or i64 42, [[TMP2]] +; CHECK-NEXT: [[ARG0_SPECIALIZED:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: call void (...) 
@use(ptr [[ARG0_SPECIALIZED]], ptr addrspace(20) inttoptr (i32 43 to ptr addrspace(20))) +; CHECK-NEXT: ret void +; + call void @specialize(i32 1, ptr %arg0, i32 1, i32 42, i32 0, i32 poison) + call void @specialize(i32 1, ptr addrspace(20) %arg1, i32 1, i32 43) + call void (...) @use(ptr %arg0, ptr addrspace(20) %arg1) + ret void +} + +define void @Array([3 x i32] %args) { +; CHECK-LABEL: define void @Array( +; CHECK-SAME: [3 x i32] [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue [3 x i32] [[ARGS]], i32 42, 1 +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 poison +; CHECK-NEXT: [[ARGS_SPECIALIZED1:%.*]] = insertvalue [3 x i32] [[ARGS_SPECIALIZED]], i32 [[TMP1]], 2 +; CHECK-NEXT: call void (...) @use([3 x i32] [[ARGS_SPECIALIZED1]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, [3 x i32] %args, i32 0, i32 poison, i32 1, i32 42, i32 2, i32 poison) + call void (...) @use([3 x i32] %args) + ret void +} + +define void @Struct({ i32, i32, i32 } %args) { +; CHECK-LABEL: define void @Struct( +; CHECK-SAME: { i32, i32, i32 } [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, i32, i32 } [[ARGS]], i32 42, 1 +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 poison +; CHECK-NEXT: [[ARGS_SPECIALIZED1:%.*]] = insertvalue { i32, i32, i32 } [[ARGS_SPECIALIZED]], i32 [[TMP1]], 2 +; CHECK-NEXT: call void (...) @use({ i32, i32, i32 } [[ARGS_SPECIALIZED1]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, i32, i32 } %args, i32 0, i32 poison, i32 1, i32 42, i32 2, i32 poison) + call void (...) @use({ i32, i32, i32 } %args) + ret void +} + +define void @Vector(<3 x i32> %args) { +; CHECK-LABEL: define void @Vector( +; CHECK-SAME: <3 x i32> [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertelement <3 x i32> [[ARGS]], i32 42, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 poison +; CHECK-NEXT: [[ARGS_SPECIALIZED1:%.*]] = insertelement <3 x i32> [[ARGS_SPECIALIZED]], i32 [[TMP1]], i64 2 +; CHECK-NEXT: call void (...) 
@use(<3 x i32> [[ARGS_SPECIALIZED1]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, <3 x i32> %args, i32 0, i32 poison, i32 1, i32 42, i32 2, i32 poison) + call void (...) @use(<3 x i32> %args) + ret void +} + +; Test that when replacing some but not all dwords of a nested struct, we directly insertvalue into the outer struct +define void @NestedStructPartialReplace({ i32, { i32, i32 } } %args) { +; CHECK-LABEL: define void @NestedStructPartialReplace( +; CHECK-SAME: { i32, { i32, i32 } } [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, { i32, i32 } } [[ARGS]], i32 42, 1, 0 +; CHECK-NEXT: call void (...) @use({ i32, { i32, i32 } } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, { i32, i32 } } %args, i32 0, i32 poison, i32 1, i32 42, i32 0, i32 poison) + call void (...) @use({ i32, { i32, i32 } } %args) + ret void +} + +; Test that when replacing some but not all dwords of a nested vector, we first extract the old vector, +; insert replacements, and then insert the replaced vector +define void @NestedVectorWithPartialReplace({ i32, <2 x i32>} %args) { +; CHECK-LABEL: define void @NestedVectorWithPartialReplace( +; CHECK-SAME: { i32, <2 x i32> } [[ARGS:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, <2 x i32> } [[ARGS]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 42, i64 0 +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, <2 x i32> } [[ARGS]], <2 x i32> [[TMP2]], 1 +; CHECK-NEXT: call void (...) @use({ i32, <2 x i32> } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, <2 x i32>} %args, i32 0, i32 poison, i32 1, i32 42, i32 0, i32 poison) + call void (...) 
@use({ i32, <2 x i32>} %args) + ret void +} + +; Test that when replacing multiple but not all dwords of a nested vector, we first extract the old vector, +; insert all replacements, and then insert the replaced vector just once +define void @NestedVectorWithPartialMultiReplace({ i32, <3 x i32>} %args) { +; CHECK-LABEL: define void @NestedVectorWithPartialMultiReplace( +; CHECK-SAME: { i32, <3 x i32> } [[ARGS:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, <3 x i32> } [[ARGS]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i32> [[TMP1]], i32 42, i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 43, i64 1 +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, <3 x i32> } [[ARGS]], <3 x i32> [[TMP3]], 1 +; CHECK-NEXT: call void (...) @use({ i32, <3 x i32> } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, <3 x i32>} %args, i32 0, i32 poison, i32 1, i32 42, i32 1, i32 43, i32 0, i32 poison) + call void (...) @use({ i32, <3 x i32>} %args) + ret void +} + +; Test that when replacing all dwords of a nested vector, we inserted the replacement values +; into a new frozen poison vector, and then insertvalue that into the struct. +define void @NestedVectorWithFullReplace({ i32, <2 x i32>} %args) { +; CHECK-LABEL: define void @NestedVectorWithFullReplace( +; CHECK-SAME: { i32, <2 x i32> } [[ARGS:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = freeze <2 x i32> poison +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 42, i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 43, i64 1 +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, <2 x i32> } [[ARGS]], <2 x i32> [[TMP3]], 1 +; CHECK-NEXT: call void (...) @use({ i32, <2 x i32> } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, <2 x i32>} %args, i32 0, i32 poison, i32 1, i32 42, i32 1, i32 43) + call void (...) 
@use({ i32, <2 x i32>} %args) + ret void +} + +; There is a padding dword before the nested struct, because i64 is 64-bit aligned. +; Check that replacing dword index 4 correctly replaces the nested i32. +define void @NestedStructWithPadding({ i32, { i64, i32 } } %args) { +; CHECK-LABEL: define void @NestedStructWithPadding( +; CHECK-SAME: { i32, { i64, i32 } } [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, { i64, i32 } } [[ARGS]], i32 42, 1, 1 +; CHECK-NEXT: call void (...) @use({ i32, { i64, i32 } } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, { i64, i32 } } %args, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 1, i32 42, i32 0, i32 poison) + call void (...) @use({ i32 , { i64, i32 } } %args) + ret void +} + +define void @NestedAll({ i32, [ 2 x { i32, <2 x i32> } ] } %args) { +; CHECK-LABEL: define void @NestedAll( +; CHECK-SAME: { i32, [2 x { i32, <2 x i32> }] } [[ARGS:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, [2 x { i32, <2 x i32> }] } [[ARGS]], 1, 1, 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 42, i64 1 +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, [2 x { i32, <2 x i32> }] } [[ARGS]], <2 x i32> [[TMP2]], 1, 1, 1 +; CHECK-NEXT: call void (...) @use({ i32, [2 x { i32, <2 x i32> }] } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, [ 2 x { i32, <2 x i32> } ] } %args, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 1, i32 42) + call void (...) @use({ i32, [ 2 x { i32, <2 x i32> } ] } %args) + ret void +} + +define void @FailSmallTypes(i1 %arg0, i8 %arg1, i16 %arg2, half %arg3) { +; CHECK-LABEL: define void @FailSmallTypes( +; CHECK-SAME: i1 [[ARG0:%.*]], i8 [[ARG1:%.*]], i16 [[ARG2:%.*]], half [[ARG3:%.*]]) { +; CHECK-NEXT: call void (...) 
@use(i1 [[ARG0]], i8 [[ARG1]], i16 [[ARG2]], half [[ARG3]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 3, i1 %arg0, i32 1, i32 1) + call void @specialize(i32 3, i8 %arg1, i32 1, i32 1) + call void @specialize(i32 3, i16 %arg2, i32 1, i32 1) + call void @specialize(i32 3, half %arg3, i32 1, i32 1) + call void (...) @use(i1 %arg0, i8 %arg1, i16 %arg2, half %arg3) + ret void +} + +; These are not supported yet, but we could add support later. It would require splitting constant values though. +define void @FailSmallTypesInAggregates(<2 x i16> %arg0, [2 x i16] %arg1) { +; CHECK-LABEL: define void @FailSmallTypesInAggregates( +; CHECK-SAME: <2 x i16> [[ARG0:%.*]], [2 x i16] [[ARG1:%.*]]) { +; CHECK-NEXT: call void (...) @use(<2 x i16> [[ARG0]], [2 x i16] [[ARG1]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 3, <2 x i16> %arg0, i32 1, i32 1) + call void @specialize(i32 3, [2 x i16] %arg1, i32 1, i32 1) + call void (...) @use(<2 x i16> %arg0, [2 x i16] %arg1) + ret void +} + +; Test that replacing into the storage of a misaligned dword-sized scalar fails +; Replacing the first float succeeds, because it is dword-aligned, the second replacement should fail. +define void @FailMisalignedDwordScalar({ float, i16, float, float, i16 } %args) { +; CHECK-LABEL: define void @FailMisalignedDwordScalar( +; CHECK-SAME: { float, i16, float, float, i16 } [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { float, i16, float, float, i16 } [[ARGS]], float 0x36F5000000000000, 0 +; CHECK-NEXT: call void (...) @use({ float, i16, float, float, i16 } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 3, { float, i16, float, float, i16 } %args, i32 1, i32 42, i32 0, i32 poison, i32 1, i32 43, i32 0, i32 poison) + call void (...) @use({ float, i16, float, float, i16 } %args) + ret void +} + +; Specialize a value in control flow, testing that we insert instructions at the correct place. 
+define void @ControlFlow([2 x i32] %arg0, i1 %arg1, i1 %arg2) { +; CHECK-LABEL: define void @ControlFlow( +; CHECK-SAME: [2 x i32] [[ARG0:%.*]], i1 [[ARG1:%.*]], i1 [[ARG2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[ARG1]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[LOOPDEP:%.*]] = phi [2 x i32] [ [[ARG0]], [[ENTRY:%.*]] ], [ [[INSERTED_SPECIALIZED:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[EXTRACT:%.*]] = extractvalue [2 x i32] [[LOOPDEP]], 0 +; CHECK-NEXT: [[INCR:%.*]] = add i32 [[EXTRACT]], 1 +; CHECK-NEXT: [[INSERTED:%.*]] = insertvalue [2 x i32] [[LOOPDEP]], i32 [[INCR]], 0 +; CHECK-NEXT: [[INSERTED_SPECIALIZED]] = insertvalue [2 x i32] [[INSERTED]], i32 42, 1 +; CHECK-NEXT: call void (...) @use([2 x i32] [[INSERTED_SPECIALIZED]]) +; CHECK-NEXT: br i1 [[ARG2]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br i1 %arg1, label %loop, label %exit +loop: + %loopdep = phi [2 x i32] [ %arg0, %entry ], [ %inserted, %loop ] + %extract = extractvalue [2 x i32] %loopdep, 0 + %incr = add i32 %extract, 1 + %inserted = insertvalue [2 x i32] %loopdep, i32 %incr, 0 + call void @specialize(i32 0, [2 x i32] %inserted, i32 0, i32 poison, i32 1, i32 42) + call void (...) @use([2 x i32] %inserted) + br i1 %arg2, label %loop, label %exit +exit: + ret void +} diff --git a/docs/DdnDebugPrintf.md b/docs/DdnDebugPrintf.md index 9e4036b28e..5fea0b6feb 100644 --- a/docs/DdnDebugPrintf.md +++ b/docs/DdnDebugPrintf.md @@ -206,7 +206,7 @@ Example: #### lgc::LowerDebugPrintf -The module pass `LowerDebugPrintf` runs just before `PatchEntryPointMutate`. +The module pass `LowerDebugPrintf` runs just before `MutateEntryPoint`. 
It collects all calls to `@lgc.debug.printf` in the entire module and: * Collects the format strings and adds the `amdpal.format_strings` entry to the diff --git a/imported/llvm-dialects b/imported/llvm-dialects index 6ff7d39046..bdfb113d8d 160000 --- a/imported/llvm-dialects +++ b/imported/llvm-dialects @@ -1 +1 @@ -Subproject commit 6ff7d39046e280e446fd69aa08c6c6524c68c728 +Subproject commit bdfb113d8d765bdf4554a2b30ae909b93f26aeea diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h index d1bfd86971..e22e24e337 100644 --- a/include/vkgcDefs.h +++ b/include/vkgcDefs.h @@ -435,6 +435,27 @@ enum class ResourceLayoutScheme : unsigned { /// 3. descriptor set index for each set }; +/// Specifies compile-time values for a single slot in a constant buffer. +struct CompileTimeConst { + uint32_t offset; ///< Which constant buffer slot (i.e., vec4) contains these values. + uint32_t set; ///< Uniform set. + uint32_t binding; ///< Uniform binding. + uint32_t validBytes; ///< Mask of which bytes in the values array are valid (are provided by the caller). This + ///< is not a mask but the actual valid bytes count from first bit of 'values' as LLPC + ///< could provide enough type info.. + union { + uint32_t u32[4]; ///< The compile-time values as an array of 32-bit values (x, y, z, w). + uint16_t u16[8]; ///< The compile-time values as an array of 16-bit values. + uint8_t u8[16]; ///< The compile-time values as an array of 8-bit values. + } values; ///< The compile-time values for this slot. +}; + +/// Represents info of compile-time constants within a shader of a specified stage. +struct CompileConstInfo { + unsigned numCompileTimeConstants; ///< Number of compile time constants. + CompileTimeConst *pCompileTimeConstants; ///< Actual compile time constants data, for uniform value replacement. +}; + /// Represents per pipeline options. 
struct PipelineOptions { bool includeDisassembly; ///< If set, the disassembly for all compiled shaders will be included in @@ -515,6 +536,7 @@ struct PipelineOptions { bool enablePrimGeneratedQuery; ///< If set, primitive generated query is enabled bool disablePerCompFetch; ///< Disable per component fetch in uber fetch shader. bool reserved21; + CompileConstInfo *compileConstInfo; ///< Compile time constant data. }; /// Prototype of allocator for output data buffer, used in shader-specific operations. @@ -903,6 +925,9 @@ struct PipelineShaderOptions { /// Let dmask bits be fully enabled when call 'image.sample.c', for depth compare mode swizzling workaround. bool imageSampleDrefReturnsRgba; + + /// Application workaround: disable all fast math flags on gl_Position. + bool disableGlPositionOpt; }; /// Represents YCbCr sampler meta data in resource descriptor @@ -1315,6 +1340,7 @@ struct GraphicsPipelineBuildInfo { NggState nggState; ///< NGG state used for tuning and debugging PipelineOptions options; ///< Per pipeline tuning/debugging options bool unlinked; ///< True to build an "unlinked" half-pipeline ELF + bool enableInitUndefZero; ///< True to initialize undefined variable bool dynamicVertexStride; ///< Dynamic Vertex input Stride is enabled. 
bool enableUberFetchShader; ///< Use uber fetch shader bool enableColorExportShader; ///< Explicitly build color export shader, UnlinkedStageFragment elf will diff --git a/include/vkgcGpurtShim.h b/include/vkgcGpurtShim.h index 2246b36702..64521f5af2 100644 --- a/include/vkgcGpurtShim.h +++ b/include/vkgcGpurtShim.h @@ -38,7 +38,7 @@ namespace Vkgc { namespace gpurt { #ifdef HAVE_GPURT_SHIM -void getShaderLibrarySpirv(unsigned featureFlags, const void *&code, size_t &size); +void getShaderLibrarySpirv(Vkgc::RtIpVersion rtIpVersion, unsigned featureFlags, const void *&code, size_t &size); void getFuncTable(Vkgc::RtIpVersion rtIpVersion, Vkgc::GpurtFuncTable &table); Vkgc::RtIpVersion getRtIpVersion(Vkgc::GfxIpVersion gfxIpVersion); #endif diff --git a/lgc/CMakeLists.txt b/lgc/CMakeLists.txt index cf3417aaae..0d0ba569b4 100644 --- a/lgc/CMakeLists.txt +++ b/lgc/CMakeLists.txt @@ -149,16 +149,16 @@ target_sources(LLVMlgc PRIVATE patch/NggPrimShader.cpp patch/Patch.cpp patch/PatchBufferOp.cpp - patch/PatchCheckShaderCache.cpp - patch/PatchCopyShader.cpp - patch/PatchEntryPointMutate.cpp - patch/PatchImageDerivatives.cpp - patch/PatchInOutImportExport.cpp - patch/PatchInvariantLoads.cpp - patch/PatchLlvmIrInclusion.cpp - patch/PatchLoadScalarizer.cpp - patch/PatchMulDx9Zero.cpp - patch/PatchLoopMetadata.cpp + patch/CheckShaderCache.cpp + patch/GenerateCopyShader.cpp + patch/MutateEntryPoint.cpp + patch/LowerImageDerivatives.cpp + patch/LowerInOut.cpp + patch/LowerInvariantLoads.cpp + patch/IncludeLlvmIr.cpp + patch/ScalarizeLoads.cpp + patch/LowerMulDx9Zero.cpp + patch/AddLoopMetadata.cpp patch/PatchNullFragShader.cpp patch/PatchPeepholeOpt.cpp patch/PatchPreparePipelineAbi.cpp @@ -172,7 +172,7 @@ target_sources(LLVMlgc PRIVATE patch/ShaderMerger.cpp patch/SystemValues.cpp patch/VertexFetch.cpp - patch/PatchImageOpCollect.cpp + patch/CollectImageOperations.cpp patch/RegisterMetadataBuilder.cpp #if VKI_BUILD_STRIX1 patch/WorkaroundDsSubdwordWrite.cpp diff --git 
a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp index 088f655ed2..3023c51823 100644 --- a/lgc/builder/BuilderImpl.cpp +++ b/lgc/builder/BuilderImpl.cpp @@ -76,21 +76,18 @@ Type *BuilderBase::getConditionallyVectorizedTy(Type *elementTy, Type *maybeVecT Value *BuilderImpl::CreateDotProduct(Value *const vector1, Value *const vector2, const Twine &instName) { if (vector1->getType()->getScalarType()->isBFloatTy()) { assert(getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 11); - // amdgcn_fdot2_bf16_bf16 will be used. + // Note: v_dot2_bf16_bf16 only respects RTE mode according to HW spec. We must check the specified rounding mode + // before using it. Also, v_dot2_bf16_bf16 doesn't respect signed zeros so we must check NSZ as well. const auto fp16RoundMode = getPipelineState()->getShaderModes()->getCommonShaderMode(m_shaderStage.value()).fp16RoundMode; const auto vectorTy = dyn_cast(vector1->getType()); - if (vectorTy && (fp16RoundMode == FpRoundMode::DontCare || fp16RoundMode == FpRoundMode::Even)) { + if (vectorTy && (fp16RoundMode == FpRoundMode::DontCare || fp16RoundMode == FpRoundMode::Even) && + getFastMathFlags().noSignedZeros()) { int compCount = vectorTy->getNumElements(); Value *result = nullptr; if (compCount % 2 == 0) { - // If all products are of the form +x * -0.0, then the result should be -0.0. This requires a -0.0 - // initial value. - // - // However, we prefer +0.0 as initial value when signed zeros are disabled because it can be encoded as an - // inline constant. - result = ConstantFP::get(getBFloatTy(), getFastMathFlags().noSignedZeros() ? +0.0 : -0.0); + result = ConstantFP::get(getBFloatTy(), 0.0); } else { // If the component count is odd, prefer feeding the last product (odd one out) as initial value. 
Value *lhs = CreateExtractElement(vector1, compCount - 1); diff --git a/lgc/builder/BuilderRecorder.cpp b/lgc/builder/BuilderRecorder.cpp index 280838a6f3..143a7b5a6c 100644 --- a/lgc/builder/BuilderRecorder.cpp +++ b/lgc/builder/BuilderRecorder.cpp @@ -178,6 +178,8 @@ StringRef BuilderRecorder::getCallName(BuilderOpcode opcode) { return "get.desc.ptr"; case BuilderOpcode::LoadPushConstantsPtr: return "load.push.constants.ptr"; + case BuilderOpcode::SamplerFeedbackDesc: + return "sampler.feedback.desc"; case BuilderOpcode::ReadGenericInput: return "read.generic.input"; case BuilderOpcode::ReadPerVertexInput: @@ -1084,6 +1086,16 @@ Value *Builder::CreateLoadPushConstantsPtr(const Twine &instName) { return record(BuilderOpcode::LoadPushConstantsPtr, getPtrTy(ADDR_SPACE_CONST), {}, instName); } +// ===================================================================================================================== +// Merges a resource descriptor into a feedback descriptor to create a descriptor for sampler feedback instructions. +// +// @param feedbackDesc : feedback descriptor +// @param resourceDesc : resource descriptor +Value *Builder::CreateSamplerFeedbackDesc(Value *feedbackDesc, Value *resourceDesc, const Twine &instName) { + return record(BuilderOpcode::SamplerFeedbackDesc, getDescTy(ResourceNodeType::DescriptorResource), + {feedbackDesc, resourceDesc}, instName); +} + // ===================================================================================================================== // Create an image load. 
// @@ -2043,6 +2055,7 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRefsetDoesNotAccessMemory(); break; diff --git a/lgc/builder/BuilderRecorder.h b/lgc/builder/BuilderRecorder.h index 8a9fd60521..cecb86a981 100644 --- a/lgc/builder/BuilderRecorder.h +++ b/lgc/builder/BuilderRecorder.h @@ -109,6 +109,7 @@ enum BuilderOpcode : unsigned { GetDescStride, GetDescPtr, LoadPushConstantsPtr, + SamplerFeedbackDesc, // Image ImageLoad, diff --git a/lgc/builder/BuilderReplayer.cpp b/lgc/builder/BuilderReplayer.cpp index 608b7014dc..b3e34c76bb 100644 --- a/lgc/builder/BuilderReplayer.cpp +++ b/lgc/builder/BuilderReplayer.cpp @@ -408,6 +408,10 @@ Value *BuilderReplayer::processCall(unsigned opcode, CallInst *call) { return m_builder->CreateLoadPushConstantsPtr(); } + case BuilderOpcode::SamplerFeedbackDesc: { + return m_builder->CreateSamplerFeedbackDesc(args[0], args[1]); + } + // Replayer implementations of ImageBuilder methods case BuilderOpcode::ImageLoad: { unsigned dim = cast(args[0])->getZExtValue(); diff --git a/lgc/builder/DescBuilder.cpp b/lgc/builder/DescBuilder.cpp index bf716ec257..0b1374d0af 100644 --- a/lgc/builder/DescBuilder.cpp +++ b/lgc/builder/DescBuilder.cpp @@ -254,7 +254,7 @@ Value *BuilderImpl::CreateGetDescPtr(ResourceNodeType concreteType, ResourceNode // or phi node, we rely on subsequent LLVM optimizations promoting the value back to a constant. 
StringRef startGlobalName = lgcName::ImmutableSamplerGlobal; std::string globalName = - (startGlobalName + Twine(node->set) + "_" + Twine(node->binding) + "_" + Twine(node->visibility)).str(); + (startGlobalName + Twine(node->set) + "_" + Twine(node->binding) + "_" + Twine(node->visibility.toRaw())).str(); Module *module = GetInsertPoint()->getModule(); descPtr = module->getGlobalVariable(globalName, /*AllowInternal=*/true); if (!descPtr) { diff --git a/lgc/builder/ImageBuilder.cpp b/lgc/builder/ImageBuilder.cpp index e24421b9f0..7e7aa56afb 100644 --- a/lgc/builder/ImageBuilder.cpp +++ b/lgc/builder/ImageBuilder.cpp @@ -650,7 +650,7 @@ Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags // Rectangle image uses the same Intrinsic ID with 2D image. Intrinsic::ID intrinsicId = (dim == DimRect) ? table[Dim2D] : table[dim]; - imageInst = CreateIntrinsic(intrinsicId, {intrinsicDataTy, coords[0]->getType()}, args, nullptr, instName); + imageInst = CreateIntrinsic(intrinsicDataTy, intrinsicId, args, nullptr, instName); } else { // Texel buffer descriptor. Use the buffer instruction. imageDescArgIndex = args.size(); @@ -867,7 +867,7 @@ Value *BuilderImpl::CreateImageStore(Value *texel, unsigned dim, unsigned flags, // Rectangle image uses the same Intrinsic ID with 2D image. Intrinsic::ID intrinsicId = (dim == DimRect) ? table[Dim2D] : table[dim]; - imageStore = CreateIntrinsic(intrinsicId, {texelTy, coords[0]->getType()}, args, nullptr, instName); + imageStore = CreateIntrinsic(getVoidTy(), intrinsicId, args, nullptr, instName); } else { // Texel buffer descriptor. Use the buffer instruction. // First widen texel to vec4 if necessary. @@ -1148,9 +1148,6 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign // Build the intrinsic arguments and overloaded types. SmallVector args; - SmallVector overloadTys; - if (resultTy && !resultTy->isVoidTy()) - overloadTys.push_back(resultTy); // Dmask. 
unsigned dmask = 15; @@ -1185,10 +1182,8 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign } // Bias: float - if (Value *biasVal = address[ImageAddressIdxLodBias]) { + if (Value *biasVal = address[ImageAddressIdxLodBias]) args.push_back(biasVal); - overloadTys.push_back(biasVal->getType()); - } // ZCompare (dref) if (Value *zCompareVal = address[ImageAddressIdxZCompare]) { @@ -1198,14 +1193,10 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign } // Grad (explicit derivatives) - if (!derivatives.empty()) { - args.insert(args.end(), derivatives.begin(), derivatives.end()); - overloadTys.push_back(derivatives[0]->getType()); - } + args.insert(args.end(), derivatives.begin(), derivatives.end()); // Coordinate args.insert(args.end(), coords.begin(), coords.end()); - overloadTys.push_back(coords[0]->getType()); // LodClamp if (Value *lodClampVal = address[ImageAddressIdxLodClamp]) @@ -1252,7 +1243,7 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign Intrinsic::ID intrinsicId = (dim == DimRect) ? table->ids[Dim2D] : table->ids[dim]; // Create the intrinsic. - Instruction *imageOp = CreateIntrinsic(intrinsicId, overloadTys, args, nullptr, instName); + Instruction *imageOp = CreateIntrinsic(resultTy, intrinsicId, args, nullptr, instName); // Add a waterfall loop if needed. SmallVector nonUniformArgIndexes; @@ -1368,8 +1359,12 @@ Value *BuilderImpl::CreateImageAtomicCommon(unsigned atomicOp, unsigned dim, uns // Rectangle image uses the same Intrinsic ID with 2D image. Intrinsic::ID intrinsicId = (dim == DimRect) ? 
ImageAtomicIntrinsicTable[atomicOp][Dim2D] : ImageAtomicIntrinsicTable[atomicOp][dim]; +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION >= 511095 + atomicInst = CreateIntrinsic(inputValue->getType(), intrinsicId, args, nullptr, instName); +#else atomicInst = CreateIntrinsic(intrinsicId, {inputValue->getType(), coord->getType()->getScalarType()}, args, nullptr, instName); +#endif } else { // Texel buffer descriptor. Use the buffer atomic instruction. args.push_back(inputValue); @@ -1660,8 +1655,8 @@ Value *BuilderImpl::CreateImageGetLod(unsigned dim, unsigned flags, Value *image args.push_back(getInt32(0)); // tfe/lwe args.push_back(getInt32(0)); // glc/slc - Instruction *result = CreateIntrinsic(ImageGetLodIntrinsicTable[dim], - {FixedVectorType::get(getFloatTy(), 2), getFloatTy()}, args, nullptr, instName); + Instruction *result = + CreateIntrinsic(FixedVectorType::get(getFloatTy(), 2), ImageGetLodIntrinsicTable[dim], args, nullptr, instName); SmallVector nonUniformArgIndexes; if (imageDesc->getType()->isVectorTy()) { @@ -2184,3 +2179,24 @@ Value *BuilderImpl::transformSamplerDesc(Value *samplerDesc) { cast(desc)->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(getContext(), {})); return desc; } + +// ===================================================================================================================== +// Merges a resource descriptor into a feedback descriptor to create a descriptor for sampler feedback instructions. 
+// +// @param feedbackDesc : feedback descriptor +// @param resourceDesc : resource descriptor +// @param instName : Name to give instruction(s) +// @returns Descriptor for use with sampler feedback image sample calls +Value *BuilderImpl::CreateSamplerFeedbackDesc(Value *feedbackDesc, Value *resourceDesc, const Twine &instName) { + GfxIpVersion gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion(); + SqImgRsrcRegHandler feedbackRsrc(this, feedbackDesc, &gfxIp); + SqImgRsrcRegHandler resourceRsrc(this, feedbackDesc, &gfxIp); + + feedbackRsrc.setReg(SqRsrcRegs::BaseLevel, resourceRsrc.getReg(SqRsrcRegs::BaseLevel)); + feedbackRsrc.setReg(SqRsrcRegs::LastLevel, resourceRsrc.getReg(SqRsrcRegs::LastLevel)); + feedbackRsrc.setReg(SqRsrcRegs::Depth, resourceRsrc.getReg(SqRsrcRegs::Depth)); + feedbackRsrc.setReg(SqRsrcRegs::BaseArray, resourceRsrc.getReg(SqRsrcRegs::BaseArray)); + feedbackRsrc.setReg(SqRsrcRegs::MinLod, resourceRsrc.getReg(SqRsrcRegs::MinLod)); + + return feedbackRsrc.getRegister(); +} diff --git a/lgc/builder/InOutBuilder.cpp b/lgc/builder/InOutBuilder.cpp index 9502c6af2e..44439f30a6 100644 --- a/lgc/builder/InOutBuilder.cpp +++ b/lgc/builder/InOutBuilder.cpp @@ -2033,33 +2033,35 @@ void BuilderImpl::markBuiltInOutputUsage(BuiltInKind builtIn, unsigned arraySize } case ShaderStage::Geometry: { - switch (builtIn) { - case BuiltInPointSize: - usage.gs.pointSize = true; - break; - case BuiltInPosition: - usage.gs.position = true; - break; - case BuiltInClipDistance: - usage.gs.clipDistance = std::max(usage.gs.clipDistance, arraySize); - break; - case BuiltInCullDistance: - usage.gs.cullDistance = std::max(usage.gs.cullDistance, arraySize); - break; - case BuiltInPrimitiveId: - usage.gs.primitiveId = true; - break; - case BuiltInViewportIndex: - usage.gs.viewportIndex = true; - break; - case BuiltInLayer: - usage.gs.layer = true; - break; - case BuiltInPrimitiveShadingRate: - usage.gs.primitiveShadingRate = true; - break; - default: - break; + if 
(streamId == m_pipelineState->getRasterizerState().rasterStream) { + switch (builtIn) { + case BuiltInPointSize: + usage.gs.pointSize = true; + break; + case BuiltInPosition: + usage.gs.position = true; + break; + case BuiltInClipDistance: + usage.gs.clipDistance = std::max(usage.gs.clipDistance, arraySize); + break; + case BuiltInCullDistance: + usage.gs.cullDistance = std::max(usage.gs.cullDistance, arraySize); + break; + case BuiltInPrimitiveId: + usage.gs.primitiveId = true; + break; + case BuiltInViewportIndex: + usage.gs.viewportIndex = true; + break; + case BuiltInLayer: + usage.gs.layer = true; + break; + case BuiltInPrimitiveShadingRate: + usage.gs.primitiveShadingRate = true; + break; + default: + break; + } } break; } diff --git a/lgc/builder/MatrixBuilder.cpp b/lgc/builder/MatrixBuilder.cpp index e3ed620d3e..1ad7306101 100644 --- a/lgc/builder/MatrixBuilder.cpp +++ b/lgc/builder/MatrixBuilder.cpp @@ -363,6 +363,8 @@ Type *BuilderCommon::transCooperativeMatrixElementType(CooperativeMatrixElementT case CooperativeMatrixElementType::Int32: return getInt32Ty(); case CooperativeMatrixElementType::Int8: + case CooperativeMatrixElementType::Float8: + case CooperativeMatrixElementType::BFloat8: return getInt8Ty(); default: llvm_unreachable("The element type is not supported."); @@ -411,6 +413,8 @@ bool BuilderCommon::isTypeNCooperativeMatrix(CooperativeMatrixElementType elemTy width = 32; break; case lgc::CooperativeMatrixElementType::Int8: + case lgc::CooperativeMatrixElementType::Float8: + case lgc::CooperativeMatrixElementType::BFloat8: width = 8; break; default: diff --git a/lgc/builder/SubgroupBuilder.cpp b/lgc/builder/SubgroupBuilder.cpp index 65490d9861..598097433e 100644 --- a/lgc/builder/SubgroupBuilder.cpp +++ b/lgc/builder/SubgroupBuilder.cpp @@ -1,4 +1,4 @@ -/* +/* *********************************************************************************************************************** * * Copyright (c) 2019-2024 Advanced Micro Devices, Inc. 
All Rights Reserved. @@ -442,14 +442,7 @@ Value *BuilderImpl::createSubgroupShuffle(Value *const value, Value *const index return result; } - auto mapFunc = [this](BuilderBase &builder, ArrayRef mappedArgs, - ArrayRef passthroughArgs) -> Value * { - Value *const readlane = - builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_readlane, {mappedArgs[0], passthroughArgs[0]}); - return createWaterfallLoop(cast(readlane), 1); - }; - - return CreateMapToSimpleType(mapFunc, value, index); + return createShuffleLoop(value, index, shaderStage); } // ===================================================================================================================== @@ -1418,6 +1411,26 @@ Value *BuilderImpl::createInverseBallotSelect(uint64_t selectMask, Value *const return CreateSelect(inverseBallot, value1, value2); } +// ===================================================================================================================== +// Do group ballot with all active threads participated, turning a boolean value (in a VGPR) into a subgroup-wide +// shared SGPR. +// +// @param value : The value to contribute to the SGPR, must be an boolean type. +Value *BuilderImpl::createGroupBallotAllActive(Value *const value) { + // Check the type is definitely an boolean. + assert(value->getType()->isIntegerTy(1)); + + Value *result = value; + unsigned waveSize = getShaderWaveSize(); + result = CreateIntrinsic(getIntNTy(waveSize), Intrinsic::amdgcn_ballot, result); + + // If we have a 32-bit subgroup size, we need to turn the 32-bit ballot result into a 64-bit result. + if (waveSize <= 32) + result = CreateZExt(result, getInt64Ty()); + + return result; +} + // ===================================================================================================================== // Do group ballot, turning a per-lane boolean value (in a VGPR) into a subgroup-wide shared SGPR. 
// @@ -1435,15 +1448,7 @@ Value *BuilderImpl::createGroupBallot(Value *const value, ShaderStageEnum shader auto isLive = CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {}); result = CreateAnd(isLive, result); } - - unsigned waveSize = getShaderWaveSize(); - result = CreateIntrinsic(getIntNTy(waveSize), Intrinsic::amdgcn_ballot, result); - - // If we have a 32-bit subgroup size, we need to turn the 32-bit ballot result into a 64-bit result. - if (waveSize <= 32) - result = CreateZExt(result, getInt64Ty()); - - return result; + return createGroupBallotAllActive(result); } // ===================================================================================================================== @@ -1454,6 +1459,87 @@ Value *BuilderImpl::createGroupBallot(Value *const value) { return createGroupBallot(value, m_shaderStage.value()); } +// ===================================================================================================================== +// Create a traditional loop for subgroup shuffle. +// +// This is done in three steps: +// 1. Collect the active lane mask for loop condition. +// +// 2. Check whether the shuffle index of each lane is equal to the shuffle index of first lane. If so, update the value +// of the current lane. +// +// 3. Update the first lane by update work list. +// +// Pseudo code: +// result = poison +// workList = ballot(true) +// do { +// firstLaneIdx = find_first_set(workList) +// currentSrcLaneIdx = readlane(srcLaneIdx, firstLaneIdx) +// notCurrentLane = srcLaneIdx != currentSrcLaneIdx +// CreateMapToSimpleType +// value = readlane(srcData, currentSrcLaneIdx) +// result = notCurrentLane ? result : value +// workList &= ballot(notCurrentLane) +// } +// while (workList != 0) +// +// @param value : The value to shuffle. +// @param index : The index to shuffle from. 
+// @param instName : Name to give instruction(s) +llvm::Value *BuilderImpl::createShuffleLoop(llvm::Value *const value, llvm::Value *const index, + ShaderStageEnum shaderStage, const llvm::Twine &instName) { + assert(value != nullptr && index != nullptr); + // Return readlane directly, if the index is a constant value. + if (isa(index)) + return CreateIntrinsic(getInt32Ty(), Intrinsic::amdgcn_readlane, {value, index}); + + // Creat workList out of loop + // By implementation, the Insert point has been set to the callInst when call processCall + auto *loopPoint = &*(GetInsertPoint()); + auto *originalBlock = loopPoint->getParent(); + + // We are forcing all active threads participate the shuffle because CreateSubgroupClusteredMultiExclusive() + // depends on this to be correct. + // TODO: Refine the code or algorithm so that createShuffleLoop is no longer affected by external code + // implementations. + auto *workList = createGroupBallotAllActive(getTrue()); + + // Init loop block. + auto *loop = originalBlock->splitBasicBlock(loopPoint, ".shuffleLoop"); + auto *loopNext = loop->splitBasicBlock(loop->getFirstInsertionPt()); + SetInsertPoint(loop->getFirstInsertionPt()); + + Type *waveSize = workList->getType(); + auto *resultPhi = CreatePHI(value->getType(), 2); + auto *workListPhi = CreatePHI(workList->getType(), 2); + resultPhi->addIncoming(PoisonValue::get(value->getType()), originalBlock); + workListPhi->addIncoming(workList, originalBlock); + auto *firstLaneIndex = + CreateZExtOrTrunc(CreateIntrinsic(Intrinsic::cttz, waveSize, {workListPhi, getTrue()}), getInt32Ty()); + // In each loop iteration, the lanes with the same shuffle source index are being processed together. So, + // the iteration count will be equal to the count of unique values of the shuffle index. 
+ Value *const currentSrcLaneIndex = + CreateIntrinsic(index->getType(), Intrinsic::amdgcn_readlane, {index, firstLaneIndex}); + auto *notCurrentLane = CreateICmpNE(index, currentSrcLaneIndex); + auto mapFunc = [](BuilderBase &builder, ArrayRef mappedArgs, ArrayRef passthroughArgs) -> Value * { + Value *const index = passthroughArgs[0]; + Value *const result = mappedArgs[0]; + Value *const srcDate = mappedArgs[1]; + Value *const value = builder.CreateIntrinsic(srcDate->getType(), Intrinsic::amdgcn_readlane, {srcDate, index}); + return builder.CreateSelect(passthroughArgs[1], result, value); + }; + auto result = CreateMapToSimpleType(mapFunc, {resultPhi, value}, {currentSrcLaneIndex, notCurrentLane}); + auto newWorkList = CreateAnd(createGroupBallotAllActive(notCurrentLane), workListPhi); + resultPhi->addIncoming(result, loop); + workListPhi->addIncoming(newWorkList, loop); + auto *cond = CreateICmpEQ(newWorkList, ConstantInt::get(waveSize, 0)); + CreateCondBr(cond, loopNext, loop); + loop->back().eraseFromParent(); + SetInsertPoint(loopPoint); + return result; +} + // ===================================================================================================================== // Search the MSB index of the mask, not handle zero. 
// diff --git a/lgc/disassembler/Disassembler.cpp b/lgc/disassembler/Disassembler.cpp index 8dc125c0a3..0466a4b548 100644 --- a/lgc/disassembler/Disassembler.cpp +++ b/lgc/disassembler/Disassembler.cpp @@ -257,7 +257,9 @@ void ObjDisassembler::run() { std::unique_ptr regInfo(m_target->createMCRegInfo(m_tripleName)); if (!regInfo) report_fatal_error(m_data.getBufferIdentifier() + ": No register info for target"); - std::unique_ptr asmInfo(m_target->createMCAsmInfo(*regInfo, m_tripleName, MCTargetOptions())); + MCTargetOptions targetOptions{}; + targetOptions.AsmVerbose = true; + std::unique_ptr asmInfo(m_target->createMCAsmInfo(*regInfo, m_tripleName, targetOptions)); if (!asmInfo) report_fatal_error(m_data.getBufferIdentifier() + ": No assembly info for target"); m_subtargetInfo.reset(m_target->createMCSubtargetInfo(m_tripleName, *mcpu, features.getString())); @@ -267,7 +269,7 @@ void ObjDisassembler::run() { if (!instrInfo) report_fatal_error(m_data.getBufferIdentifier() + ": No instruction info for target"); - MCContext context(triple, asmInfo.get(), regInfo.get(), m_subtargetInfo.get()); + MCContext context(triple, asmInfo.get(), regInfo.get(), m_subtargetInfo.get(), nullptr, &targetOptions); std::unique_ptr objFileInfo(m_target->createMCObjectFileInfo(context, /*PIC=*/false)); if (!objFileInfo) report_fatal_error("No MC object file info"); @@ -282,9 +284,12 @@ void ObjDisassembler::run() { report_fatal_error(m_data.getBufferIdentifier() + ": No instruction printer for target"); auto fostream = std::make_unique(m_ostream); +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 505779 m_streamer.reset(m_target->createAsmStreamer(*m_context, std::move(fostream), true, false, m_instPrinter, nullptr, nullptr, false)); - +#else + m_streamer.reset(m_target->createAsmStreamer(*m_context, std::move(fostream), m_instPrinter, nullptr, nullptr)); +#endif // Process each section. 
for (ELFSectionRef sectionRef : m_objFile->sections()) processSection(sectionRef); diff --git a/lgc/elfLinker/ElfLinker.cpp b/lgc/elfLinker/ElfLinker.cpp index 7fdda5b912..967675eda5 100644 --- a/lgc/elfLinker/ElfLinker.cpp +++ b/lgc/elfLinker/ElfLinker.cpp @@ -72,11 +72,6 @@ ElfLinkerImpl::ElfLinkerImpl(PipelineState *pipelineState, ArrayRef elfs); - // Destructor - ~ElfLinkerImpl() override final; - // ----------------------------------------------------------------------------------------------------------------- // Implementations of ElfLinker methods exposed to the front-end diff --git a/lgc/elfLinker/GlueShader.h b/lgc/elfLinker/GlueShader.h index 4bb2a42c15..6d4b8c102e 100644 --- a/lgc/elfLinker/GlueShader.h +++ b/lgc/elfLinker/GlueShader.h @@ -43,7 +43,7 @@ class LgcContext; // Base class for a glue shader (a fetch shader or parameter/color export shader generated during linking) class GlueShader { public: - virtual ~GlueShader() {} + virtual ~GlueShader() = default; // Create a color export shader static std::unique_ptr createColorExportShader(PipelineState *pipelineState, diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h index b1fd73c612..7603c29cf8 100644 --- a/lgc/include/lgc/builder/BuilderImpl.h +++ b/lgc/include/lgc/builder/BuilderImpl.h @@ -312,6 +312,10 @@ class BuilderImpl : public BuilderDefs { // Build buffer compact descriptor llvm::Value *buildBufferCompactDesc(llvm::Value *desc, unsigned stride); + // Build image sampler feedback descriptor + llvm::Value *CreateSamplerFeedbackDesc(llvm::Value *feedbackDesc, llvm::Value *resourceDesc, + const llvm::Twine &instName = ""); + private: // Get a struct containing the pointer and byte stride for a descriptor llvm::Value *getDescPtrAndStride(ResourceNodeType resType, uint64_t descSet, unsigned binding, @@ -791,10 +795,14 @@ class BuilderImpl : public BuilderDefs { uint16_t getDsSwizzleQuadMode(uint8_t lane0, uint8_t lane1, uint8_t lane2, uint8_t lane3); 
llvm::Value *createGroupBallot(llvm::Value *const value); + // Create a traditional loop for subgroup shuffle. + llvm::Value *createShuffleLoop(llvm::Value *const value, llvm::Value *const index, ShaderStageEnum shaderStage, + const llvm::Twine &instName = ""); protected: // The subgroup operation with explicit shader stage as parameter. llvm::Value *createFindMsb(llvm::Value *const mask); + llvm::Value *createGroupBallotAllActive(llvm::Value *const value); llvm::Value *createGroupBallot(llvm::Value *const value, ShaderStageEnum shaderStage); llvm::Value *createSubgroupBroadcastFirst(llvm::Value *const value, ShaderStageEnum shaderStage, const llvm::Twine &instName); diff --git a/lgc/include/lgc/patch/PatchLoopMetadata.h b/lgc/include/lgc/patch/AddLoopMetadata.h similarity index 99% rename from lgc/include/lgc/patch/PatchLoopMetadata.h rename to lgc/include/lgc/patch/AddLoopMetadata.h index b547dc7910..afe46f16ca 100644 --- a/lgc/include/lgc/patch/PatchLoopMetadata.h +++ b/lgc/include/lgc/patch/AddLoopMetadata.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLoopMetadata.h + * @file AddLoopMetadata.h * @brief LLPC header file: contains declaration of class lgc::PatchLoopMetadata. 
*********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/PatchCheckShaderCache.h b/lgc/include/lgc/patch/CheckShaderCache.h similarity index 89% rename from lgc/include/lgc/patch/PatchCheckShaderCache.h rename to lgc/include/lgc/patch/CheckShaderCache.h index eb454f6461..5d31f625a6 100644 --- a/lgc/include/lgc/patch/PatchCheckShaderCache.h +++ b/lgc/include/lgc/patch/CheckShaderCache.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchCheckShaderCache.h - * @brief LLPC header file: contains declaration of class lgc::PatchCheckShaderCache + * @file CheckShaderCache.h + * @brief LLPC header file: contains declaration of class lgc::CheckShaderCache *********************************************************************************************************************** */ #pragma once @@ -38,11 +38,11 @@ namespace lgc { // ===================================================================================================================== // Represents the pass of LLVM patching operations for checking shader cache -class PatchCheckShaderCache : public Patch, public llvm::PassInfoMixin { +class CheckShaderCache : public Patch, public llvm::PassInfoMixin { public: - PatchCheckShaderCache() {} + CheckShaderCache() {} - PatchCheckShaderCache(Pipeline::CheckShaderCacheFunc callbackFunc); + CheckShaderCache(Pipeline::CheckShaderCacheFunc callbackFunc); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/PatchImageOpCollect.h b/lgc/include/lgc/patch/CollectImageOperations.h similarity index 98% rename from lgc/include/lgc/patch/PatchImageOpCollect.h rename to 
lgc/include/lgc/patch/CollectImageOperations.h index b124ae15da..d4f563f37b 100644 --- a/lgc/include/lgc/patch/PatchImageOpCollect.h +++ b/lgc/include/lgc/patch/CollectImageOperations.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchImageOpCollect.h + * @file CollectImageOperations.h * @brief LLPC header file: contains declaration of class lgc::PatchImageOpCollect. *********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/FragColorExport.h b/lgc/include/lgc/patch/FragColorExport.h index c7e949f547..1fcd756226 100644 --- a/lgc/include/lgc/patch/FragColorExport.h +++ b/lgc/include/lgc/patch/FragColorExport.h @@ -66,6 +66,7 @@ class FragColorExport { unsigned expFmt[MaxColorTargets]; // Export format used for "export" instruction. unsigned waveSize; // The wave size for fragment. bool enableFragColor; // Whether to broadcast frag color. Only for OGLP + ExportFormat dummyExpFmt; // Export format used for dummy "export" instruction. 
}; FragColorExport(LgcContext *context); @@ -74,7 +75,6 @@ class FragColorExport { bool dummyExport, PalMetadata *palMetadata, BuilderBase &builder, llvm::Value *dynamicIsDualSource, const Key &key); static void setDoneFlag(llvm::Value *exportInst, BuilderBase &builder); - static llvm::CallInst *addDummyExport(BuilderBase &builder); static llvm::Function *generateNullFragmentShader(llvm::Module &module, PipelineState *pipelineState, llvm::StringRef entryPointName); static llvm::Function *generateNullFragmentEntryPoint(llvm::Module &module, PipelineState *pipelineState, diff --git a/lgc/include/lgc/patch/PatchCopyShader.h b/lgc/include/lgc/patch/GenerateCopyShader.h similarity index 94% rename from lgc/include/lgc/patch/PatchCopyShader.h rename to lgc/include/lgc/patch/GenerateCopyShader.h index f52784a094..a01372dd45 100644 --- a/lgc/include/lgc/patch/PatchCopyShader.h +++ b/lgc/include/lgc/patch/GenerateCopyShader.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchCopyShader.h - * @brief LLPC header file: contains declaration of class lgc::PatchCopyShader. + * @file GenerateCopyShader.h + * @brief LLPC header file: contains declaration of class lgc::GenerateCopyShader. 
*********************************************************************************************************************** */ #pragma once @@ -40,7 +40,7 @@ namespace lgc { // ===================================================================================================================== // Pass to generate copy shader if required -class PatchCopyShader : public Patch, public llvm::PassInfoMixin { +class GenerateCopyShader : public Patch, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/PatchLlvmIrInclusion.h b/lgc/include/lgc/patch/IncludeLlvmIr.h similarity index 98% rename from lgc/include/lgc/patch/PatchLlvmIrInclusion.h rename to lgc/include/lgc/patch/IncludeLlvmIr.h index 1a253cabb9..b198d2c60a 100644 --- a/lgc/include/lgc/patch/PatchLlvmIrInclusion.h +++ b/lgc/include/lgc/patch/IncludeLlvmIr.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLlvmIrInclusion.h + * @file IncludeLlvmIr.h * @brief LLPC header file: contains declaration of class lgc::PatchLlvmIrInclusion. 
*********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/LowerGpuRt.h b/lgc/include/lgc/patch/LowerGpuRt.h index 5876d01cd7..c11e6a5cb8 100644 --- a/lgc/include/lgc/patch/LowerGpuRt.h +++ b/lgc/include/lgc/patch/LowerGpuRt.h @@ -66,13 +66,14 @@ class GpurtInitStaticIdOp; class LowerGpuRt : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + void updateWorkgroupSize(llvm::Function *func); private: typedef void (LowerGpuRt::*LibraryFuncPtr)(llvm::Function *, unsigned); const static unsigned MaxLdsStackEntries = 16; - uint32_t getWorkgroupSize() const; + unsigned getWorkgroupSize(llvm::Function *func) const; llvm::Value *getThreadIdInGroup() const; - void createGlobalStack(llvm::Module &module); + void createLdsStack(llvm::Module &module); void createRayStaticIdValue(); void visitGetStackSize(lgc::GpurtGetStackSizeOp &inst); void visitGetStackBase(lgc::GpurtGetStackBaseOp &inst); @@ -100,5 +101,6 @@ class LowerGpuRt : public llvm::PassInfoMixin { llvm::SmallSet m_funcsToLower; // Functions to lower Builder *m_builder = nullptr; unsigned m_rayStaticId = 0; + unsigned m_workGroupSize = 0; }; } // namespace lgc diff --git a/lgc/include/lgc/patch/PatchImageDerivatives.h b/lgc/include/lgc/patch/LowerImageDerivatives.h similarity index 92% rename from lgc/include/lgc/patch/PatchImageDerivatives.h rename to lgc/include/lgc/patch/LowerImageDerivatives.h index 7af727cfaf..2f1c024fa7 100644 --- a/lgc/include/lgc/patch/PatchImageDerivatives.h +++ b/lgc/include/lgc/patch/LowerImageDerivatives.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchImageDerivatives.h - * @brief 
LLPC header file: contains declaration of class lgc::PatchImageDerivatives. + * @file LowerImageDerivatives.h + * @brief LLPC header file: contains declaration of class lgc::LowerImageDerivatives. *********************************************************************************************************************** */ #pragma once @@ -38,7 +38,7 @@ namespace lgc { // ===================================================================================================================== // Represents the pass of LLVM patching operations for image operations -class PatchImageDerivatives : public llvm::PassInfoMixin { +class LowerImageDerivatives : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/PatchInOutImportExport.h b/lgc/include/lgc/patch/LowerInOut.h similarity index 99% rename from lgc/include/lgc/patch/PatchInOutImportExport.h rename to lgc/include/lgc/patch/LowerInOut.h index 22f4c1672f..e165006e49 100644 --- a/lgc/include/lgc/patch/PatchInOutImportExport.h +++ b/lgc/include/lgc/patch/LowerInOut.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchInOutImportExport.h + * @file LowerInOut.h * @brief LLPC header file: contains declaration of class lgc::PatchInOutImportExport. 
*********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/PatchInvariantLoads.h b/lgc/include/lgc/patch/LowerInvariantLoads.h similarity index 89% rename from lgc/include/lgc/patch/PatchInvariantLoads.h rename to lgc/include/lgc/patch/LowerInvariantLoads.h index b241621c97..f3180ca0cc 100644 --- a/lgc/include/lgc/patch/PatchInvariantLoads.h +++ b/lgc/include/lgc/patch/LowerInvariantLoads.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchInvariantLoads.h - * @brief LLPC header file: contains declaration of class lgc::PatchInvariantLoads. + * @file LowerInvariantLoads.h + * @brief LLPC header file: contains declaration of class lgc::LowerInvariantLoads. *********************************************************************************************************************** */ #pragma once @@ -37,8 +37,8 @@ namespace lgc { // ===================================================================================================================== -// Represents the pass of LLVM patching operations for image operations -class PatchInvariantLoads : public llvm::PassInfoMixin { +// Represents the LLVM pass for patching operations to lower invariant loads +class LowerInvariantLoads : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Function &function, llvm::FunctionAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/PatchMulDx9Zero.h b/lgc/include/lgc/patch/LowerMulDx9Zero.h similarity index 99% rename from lgc/include/lgc/patch/PatchMulDx9Zero.h rename to lgc/include/lgc/patch/LowerMulDx9Zero.h index 47861db8df..400c447187 100644 --- a/lgc/include/lgc/patch/PatchMulDx9Zero.h +++ 
b/lgc/include/lgc/patch/LowerMulDx9Zero.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchMulDx9Zero.h + * @file LowerMulDx9Zero.h * @brief LLPC header file: contains declaration of class lgc::PatchMulDx9Zero. *********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/PatchEntryPointMutate.h b/lgc/include/lgc/patch/MutateEntryPoint.h similarity index 96% rename from lgc/include/lgc/patch/PatchEntryPointMutate.h rename to lgc/include/lgc/patch/MutateEntryPoint.h index e56338b3ef..639f3fca53 100644 --- a/lgc/include/lgc/patch/PatchEntryPointMutate.h +++ b/lgc/include/lgc/patch/MutateEntryPoint.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchEntryPointMutate.h - * @brief LLPC header file: contains declaration of class lgc::PatchEntryPointMutate. + * @file MutateEntryPoint.h + * @brief LLPC header file: contains declaration of class lgc::MutateEntryPoint. 
*********************************************************************************************************************** */ #pragma once @@ -48,9 +48,9 @@ class UserDataOp; // ===================================================================================================================== // The entry-point mutation pass -class PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin { +class MutateEntryPoint : public Patch, public llvm::PassInfoMixin { public: - PatchEntryPointMutate(); + MutateEntryPoint(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Patch LLVM for entry-point mutation"; } @@ -86,7 +86,7 @@ class PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin, ShaderStage::Count> m_userDataUsage; + ShaderStageMap> m_userDataUsage; class CpsShaderInputCache { public: diff --git a/lgc/include/lgc/patch/Patch.h b/lgc/include/lgc/patch/Patch.h index f78951b524..c938e26da2 100644 --- a/lgc/include/lgc/patch/Patch.h +++ b/lgc/include/lgc/patch/Patch.h @@ -49,8 +49,8 @@ class PassManager; // Represents the pass of LLVM patching operations, as the base class. 
class Patch { public: - Patch() : m_module(nullptr), m_context(nullptr), m_shaderStage(ShaderStage::Invalid), m_entryPoint(nullptr) {} - virtual ~Patch() {} + Patch() : m_module(nullptr), m_context(nullptr), m_entryPoint(nullptr) {} + virtual ~Patch() = default; static void addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, llvm::Timer *patchTimer, llvm::Timer *optTimer, Pipeline::CheckShaderCacheFunc checkShaderCacheFunc, uint32_t optLevel); @@ -68,10 +68,10 @@ class Patch { void init(llvm::Module *module); - llvm::Module *m_module; // LLVM module to be run on - llvm::LLVMContext *m_context; // Associated LLVM context of the LLVM module that passes run on - ShaderStageEnum m_shaderStage; // Shader stage - llvm::Function *m_entryPoint; // Entry-point + llvm::Module *m_module; // LLVM module to be run on + llvm::LLVMContext *m_context; // Associated LLVM context of the LLVM module that passes run on + std::optional m_shaderStage; // Shader stage + llvm::Function *m_entryPoint; // Entry-point }; } // namespace lgc diff --git a/lgc/include/lgc/patch/PatchLoadScalarizer.h b/lgc/include/lgc/patch/ScalarizeLoads.h similarity index 98% rename from lgc/include/lgc/patch/PatchLoadScalarizer.h rename to lgc/include/lgc/patch/ScalarizeLoads.h index f29ed9e4c4..670667e58f 100644 --- a/lgc/include/lgc/patch/PatchLoadScalarizer.h +++ b/lgc/include/lgc/patch/ScalarizeLoads.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLoadScalarizer.h + * @file ScalarizeLoads.h * @brief LLPC header file: contains declaration of class lgc::PatchLoadScalarizer. 
*********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/ShaderInputs.h b/lgc/include/lgc/patch/ShaderInputs.h index 390cc43699..d7b139a53f 100644 --- a/lgc/include/lgc/patch/ShaderInputs.h +++ b/lgc/include/lgc/patch/ShaderInputs.h @@ -145,12 +145,12 @@ enum class ShaderInput : unsigned { // ===================================================================================================================== // Class for handling shader inputs (other than user data) // -// From BuilderImpl up to just before PatchEntryPointMutate, static methods in this class can be used to +// From BuilderImpl up to just before MutateEntryPoint, static methods in this class can be used to // generate code to access shader inputs. That generates an lgc.shader.input.* call for each access. // -// The PatchEntryPointMutate pass creates a ShaderInputs object, and uses a method on it to gather already- +// The MutateEntryPoint pass creates a ShaderInputs object, and uses a method on it to gather already- // generated uses of shader inputs, and another method to create arguments for the shader function based -// on that, and on usage that will happen after PatchEntryPointMutate. +// on that, and on usage that will happen after MutateEntryPoint. // // The resulting shader function has input arguments that represent a kind of idealized GFX8 shader, // before GFX9+ shader merging and/or GFX10+ NGG primitive shader formation. 
@@ -173,7 +173,7 @@ class ShaderInputs { static const char *getInputName(ShaderInput inputKind); // ------------------------------------------------------------------------------------------------------------------- - // Static methods called before PatchEntryPointMutate + // Static methods called before MutateEntryPoint // Get a special user data value by inserting a call to lgc.special.user.data static llvm::CallInst *getSpecialUserData(UserDataMapping kind, BuilderBase &builder); @@ -191,9 +191,9 @@ class ShaderInputs { static llvm::Value *getInput(ShaderInput kind, BuilderBase &builder, const LgcContext &lgcContext); // ------------------------------------------------------------------------------------------------------------------- - // Object methods called during PatchEntryPointMutate + // Object methods called during MutateEntryPoint - // Gather usage of shader inputs from before PatchEntryPointMutate + // Gather usage of shader inputs from before MutateEntryPoint void gatherUsage(llvm::Module &module); // Fix up uses of shader inputs to use entry args directly @@ -230,7 +230,7 @@ class ShaderInputs { // amdgpu-no-workgroup-id-* void tryOptimizeWorkgroupId(PipelineState *pipelineState, ShaderStageEnum shaderStage, llvm::Function *origFunc); - llvm::SmallVector m_shaderInputsUsage; + ShaderStageMap m_shaderInputsUsage; }; } // namespace lgc diff --git a/lgc/include/lgc/patch/VertexFetch.h b/lgc/include/lgc/patch/VertexFetch.h index 8f56728457..83d7b47314 100644 --- a/lgc/include/lgc/patch/VertexFetch.h +++ b/lgc/include/lgc/patch/VertexFetch.h @@ -43,7 +43,7 @@ class InputImportGenericOp; // Public interface to vertex fetch manager. 
class VertexFetch { public: - virtual ~VertexFetch() {} + virtual ~VertexFetch() = default; // Create a VertexFetch static VertexFetch *create(LgcContext *lgcContext, bool useSoftwareVertexBufferDescriptors, diff --git a/lgc/include/lgc/state/PipelineShaders.h b/lgc/include/lgc/state/PipelineShaders.h index 6c5fb59af7..7a9dfe0783 100644 --- a/lgc/include/lgc/state/PipelineShaders.h +++ b/lgc/include/lgc/state/PipelineShaders.h @@ -46,7 +46,7 @@ class PipelineShadersResult { std::optional getShaderStage(const llvm::Function *func) const; private: - llvm::Function *m_entryPoints[ShaderStage::CountInternal]; // The entry-point for each shader stage. + ShaderStageMap m_entryPoints; // The entry-point for each shader stage. std::map m_entryPointMap; // Map from shader entry-point to shader stage. }; diff --git a/lgc/include/lgc/state/PipelineState.h b/lgc/include/lgc/state/PipelineState.h index 0515968f17..b4375d5f85 100644 --- a/lgc/include/lgc/state/PipelineState.h +++ b/lgc/include/lgc/state/PipelineState.h @@ -254,6 +254,7 @@ class PipelineState final : public Pipeline { // Accessors for shader stage mask ShaderStageMask getShaderStageMask(); bool getPreRasterHasGs() const { return m_preRasterHasGs; } + bool hasShaderStage(ShaderStageEnum stage) { return getShaderStageMask().contains(stage); } bool isGraphics(); bool isComputeLibrary() const { return m_computeLibrary; } @@ -274,7 +275,7 @@ class PipelineState final : public Pipeline { llvm::ArrayRef getUserDataNodes() const { return m_userDataNodes; } // Find the push constant resource node - const ResourceNode *findPushConstantResourceNode(std::optional shaderStage = std::nullopt) const; + const ResourceNode *findPushConstantResourceNode(std::optional shaderStage) const; // Find the resource node for the given set,binding std::pair @@ -282,7 +283,8 @@ class PipelineState final : public Pipeline { std::optional shaderStage = std::nullopt) const; // Find the single root resource node of the given type - const 
ResourceNode *findSingleRootResourceNode(ResourceNodeType nodeType, ShaderStageEnum shaderStage) const; + const ResourceNode *findSingleRootResourceNode(ResourceNodeType nodeType, + std::optional shaderStage) const; // Accessors for vertex input descriptions. llvm::ArrayRef getVertexInputDescriptions() const { return m_vertexInputDescriptions; } @@ -302,7 +304,6 @@ class PipelineState final : public Pipeline { // Set GS on-chip mode void setGsOnChip(bool gsOnChip) { m_gsOnChip = gsOnChip; } - // Checks whether GS on-chip mode is enabled // NOTE: on GFX9, ES -> GS ring is always on-chip, GS on-chip mode means GS -> VS // ring is on-chip. @@ -449,7 +450,12 @@ class PipelineState final : public Pipeline { } // Get user data for a specific shader stage - llvm::ArrayRef getUserDataMap(ShaderStageEnum shaderStage) const { return m_userDataMaps[shaderStage]; } + llvm::ArrayRef getUserDataMap(ShaderStageEnum shaderStage) const { + auto it = m_userDataMaps.find(shaderStage); + if (it != m_userDataMaps.end()) + return it->second; + return {}; + } // Set spill_threshold for a specific shader stage void setSpillThreshold(ShaderStageEnum shaderStage, unsigned spillThreshold) { @@ -583,7 +589,7 @@ class PipelineState final : public Pipeline { llvm::ArrayRef getResourceTypeNames(); llvm::MDString *getResourceTypeName(ResourceNodeType type); ResourceNodeType getResourceTypeFromName(llvm::MDString *typeName); - bool matchResourceNode(const ResourceNode &node, ResourceNodeType nodeType, uint64_t descSet, unsigned binding) const; + bool matchResourceNode(const ResourceNode &node, uint64_t descSet, unsigned binding) const; // Device index handling void recordDeviceIndex(llvm::Module *module); @@ -613,7 +619,7 @@ class PipelineState final : public Pipeline { bool m_computeLibrary = false; // Whether pipeline is in fact a compute library std::string m_client; // Client name for PAL metadata Options m_options = {}; // Per-pipeline options - std::vector m_shaderOptions; // Per-shader 
options + ShaderStageMap m_shaderOptions; // Per-shader options std::unique_ptr m_allocUserDataNodes; // Allocated buffer for user data llvm::ArrayRef m_userDataNodes; // Top-level user data node table // Cached MDString for each resource node type @@ -621,27 +627,27 @@ class PipelineState final : public Pipeline { // Allocated buffers for immutable sampler data llvm::SmallVector, 4> m_immutableValueAllocs; - bool m_gsOnChip = false; // Whether to use GS on-chip mode - bool m_meshRowExport = false; // Enable mesh shader row export or not - NggControl m_nggControl = {}; // NGG control settings - ShaderModes m_shaderModes; // Shader modes for this pipeline - unsigned m_deviceIndex = 0; // Device index - std::vector m_vertexInputDescriptions; // Vertex input descriptions - llvm::SmallVector m_colorExportFormats; // Color export formats - ColorExportState m_colorExportState = {}; // Color export state - InputAssemblyState m_inputAssemblyState = {}; // Input-assembly state - RasterizerState m_rasterizerState = {}; // Rasterizer state - DepthStencilState m_depthStencilState = {}; // Depth/stencil state - std::unique_ptr m_resourceUsage[ShaderStage::Compute + 1] = {}; // Per-shader ResourceUsage - std::unique_ptr m_interfaceData[ShaderStage::Compute + 1] = {}; // Per-shader InterfaceData - PalMetadata *m_palMetadata = nullptr; // PAL metadata object - unsigned m_waveSize[ShaderStage::CountInternal] = {}; // Per-shader wave size - unsigned m_subgroupSize[ShaderStage::CountInternal] = {}; // Per-shader subgroup size - bool m_inputPackState[ShaderStage::GfxCount] = {}; // The input packable state per shader stage - bool m_outputPackState[ShaderStage::GfxCount] = {}; // The output packable state per shader stage - XfbStateMetadata m_xfbStateMetadata = {}; // Transform feedback state metadata - llvm::SmallVector m_userDataMaps[ShaderStage::CountInternal]; // The user data per-shader - unsigned m_shaderSpillThreshold[ShaderStage::CountInternal] = {}; // The spillThreshold 
per-shader + bool m_gsOnChip = false; // Whether to use GS on-chip mode + bool m_meshRowExport = false; // Enable mesh shader row export or not + NggControl m_nggControl = {}; // NGG control settings + ShaderModes m_shaderModes; // Shader modes for this pipeline + unsigned m_deviceIndex = 0; // Device index + std::vector m_vertexInputDescriptions; // Vertex input descriptions + llvm::SmallVector m_colorExportFormats; // Color export formats + ColorExportState m_colorExportState = {}; // Color export state + InputAssemblyState m_inputAssemblyState = {}; // Input-assembly state + RasterizerState m_rasterizerState = {}; // Rasterizer state + DepthStencilState m_depthStencilState = {}; // Depth/stencil state + ShaderStageMap> m_resourceUsage; // Per-shader ResourceUsage + ShaderStageMap> m_interfaceData; // Per-shader InterfaceData + PalMetadata *m_palMetadata = nullptr; // PAL metadata object + ShaderStageMap m_waveSize; // Per-shader wave size + ShaderStageMap m_subgroupSize; // Per-shader subgroup size + ShaderStageMap m_inputPackState; // The input packable state per shader stage + ShaderStageMap m_outputPackState; // The output packable state per shader stage + XfbStateMetadata m_xfbStateMetadata = {}; // Transform feedback state metadata + ShaderStageMap> m_userDataMaps; // The user data per-shader + unsigned m_shaderSpillThreshold[ShaderStage::CountInternal] = {}; // The spillThreshold per-shader struct { float inner[2]; // default tessellation inner level diff --git a/lgc/include/lgc/state/ResourceUsage.h b/lgc/include/lgc/state/ResourceUsage.h index 5f617e8a99..e8e946e3af 100644 --- a/lgc/include/lgc/state/ResourceUsage.h +++ b/lgc/include/lgc/state/ResourceUsage.h @@ -639,7 +639,7 @@ struct InterfaceData { }; bool initialized; // Whether entryArgIdxs has been initialized - // by PatchEntryPointMutate + // by MutateEntryPoint } entryArgIdxs = {}; InterfaceData(); diff --git a/lgc/include/lgc/state/ShaderModes.h b/lgc/include/lgc/state/ShaderModes.h index 
d297b35914..ecd1653d18 100644 --- a/lgc/include/lgc/state/ShaderModes.h +++ b/lgc/include/lgc/state/ShaderModes.h @@ -107,12 +107,12 @@ class ShaderModes { void readModesFromPipeline(llvm::Module *module); private: - CommonShaderMode m_commonShaderModes[ShaderStage::Compute + 1] = {}; // Per-shader FP modes - TessellationMode m_tessellationMode = {}; // Tessellation mode - GeometryShaderMode m_geometryShaderMode = {}; // Geometry shader mode - MeshShaderMode m_meshShaderMode = {}; // Mesh shader mode - FragmentShaderMode m_fragmentShaderMode = {}; // Fragment shader mode - ComputeShaderMode m_computeShaderMode = {}; // Compute shader mode (workgroup size) + ShaderStageMap m_commonShaderModes; // Per-shader FP mode + TessellationMode m_tessellationMode = {}; // Tessellation mode + GeometryShaderMode m_geometryShaderMode = {}; // Geometry shader mode + MeshShaderMode m_meshShaderMode = {}; // Mesh shader mode + FragmentShaderMode m_fragmentShaderMode = {}; // Fragment shader mode + ComputeShaderMode m_computeShaderMode = {}; // Compute shader mode (workgroup size) }; } // namespace lgc diff --git a/lgc/include/lgc/util/GfxRegHandler.h b/lgc/include/lgc/util/GfxRegHandler.h index 541a153d2c..03e0241de4 100644 --- a/lgc/include/lgc/util/GfxRegHandler.h +++ b/lgc/include/lgc/util/GfxRegHandler.h @@ -61,7 +61,7 @@ struct BitsState { // ===================================================================================================================== // Helper class for handling graphics registers. // Note: 1) Don't use GfxRegHandler directly, please implement your own register helper class, such as -// SqImgSampRegHelper +// SqImgSampRegHandler // 2) The ID (enum) used in this class is determined by BitsInfo // 3) The count of BisState used in this class is determined by BitsInfo // e.g. 
@@ -154,6 +154,9 @@ enum class SqRsrcRegs { WidthLo, WidthHi, ArrayPitch, + MinLod, + MinLodLo, + MinLodHi, Count, }; diff --git a/lgc/interface/lgc/Builder.h b/lgc/interface/lgc/Builder.h index f48bf0a7ad..fde589f64c 100644 --- a/lgc/interface/lgc/Builder.h +++ b/lgc/interface/lgc/Builder.h @@ -955,6 +955,14 @@ class Builder : public BuilderDefs { // @param instName : Name to give instruction(s) llvm::Value *CreateLoadPushConstantsPtr(const llvm::Twine &instName = ""); + // Merges a resource descriptor into a feedback descriptor to create a descriptor for sampler feedback instructions. + // + // @param feedbackDesc : feedback descriptor + // @param resourceDesc : resource descriptor + // @param instName : Name to give instruction(s) + llvm::Value *CreateSamplerFeedbackDesc(llvm::Value *feedbackDesc, llvm::Value *resourceDesc, + const llvm::Twine &instName = ""); + // ----------------------------------------------------------------------------------------------------------------- // Image operations diff --git a/lgc/interface/lgc/CommonDefs.h b/lgc/interface/lgc/CommonDefs.h index 4ae27eeabb..bc927c0859 100644 --- a/lgc/interface/lgc/CommonDefs.h +++ b/lgc/interface/lgc/CommonDefs.h @@ -235,6 +235,9 @@ enum class ResourceLayoutScheme : unsigned { Compact = 0, ///< Compact scheme make full use of all the user data registers. Indirect ///< Fixed layout, push constant will be the sub node of DescriptorTableVaPtr }; + +template +using ShaderStageMap = llvm::SmallDenseMap; } // namespace lgc namespace llvm { diff --git a/lgc/interface/lgc/ElfLinker.h b/lgc/interface/lgc/ElfLinker.h index e9114a17f2..d1b260c199 100644 --- a/lgc/interface/lgc/ElfLinker.h +++ b/lgc/interface/lgc/ElfLinker.h @@ -49,7 +49,7 @@ struct ColorExportInfo; // relocs. class ElfLinker { public: - virtual ~ElfLinker() {} + virtual ~ElfLinker() = default; // Add another input ELF to the link, in addition to the ones that were added when the ElfLinker was constructed. 
virtual void addInputElf(llvm::MemoryBufferRef inputElf) = 0; diff --git a/lgc/interface/lgc/LgcDialect.h b/lgc/interface/lgc/LgcDialect.h index 34b09b3a5b..facbaa2477 100644 --- a/lgc/interface/lgc/LgcDialect.h +++ b/lgc/interface/lgc/LgcDialect.h @@ -48,6 +48,8 @@ enum class CooperativeMatrixElementType : unsigned { Int32, // 32 bit integer Float16Packed, // packed 16-bit floating-point BFloat16, // 16-bit brain floating-point + Float8, // 8-bit floating-point + BFloat8, // 8-bit brain floating-point }; // Layout is virtual concept, eg: 16bit and 32bit for matrixC will share the same layout initially. diff --git a/lgc/interface/lgc/LgcDialect.td b/lgc/interface/lgc/LgcDialect.td index 8f904b2be7..eb4fb0f792 100644 --- a/lgc/interface/lgc/LgcDialect.td +++ b/lgc/interface/lgc/LgcDialect.td @@ -889,8 +889,8 @@ def CooperativeMatrixTimesScalarOp : LgcOp<"cooperative.matrix.times.scalar", [C def CooperativeMatrixMulAddOp : LgcOp<"cooperative.matrix.muladd", [Convergent, WillReturn]> { let arguments = (ins value:$matrix_a, value:$matrix_b, value:$matrix_c, AttrI1:$is_signed_a, AttrI1:$is_signed_b, - AttrI1:$is_sat_or_opsel, AttrI1:$is_tied, CooperativeMatrixElementType:$accu_elem_type, - CooperativeMatrixElementType:$factor_elem_type); + AttrI1:$is_sat_or_opsel, AttrI1:$is_tied, CooperativeMatrixElementType:$matrix_a_elem_type, + CooperativeMatrixElementType:$matrix_b_elem_type, CooperativeMatrixElementType:$matrix_c_elem_type); let results = (outs value:$result); let defaultBuilderHasExplicitResultType = true; @@ -913,8 +913,9 @@ def CooperativeMatrixMulAddOp : LgcOp<"cooperative.matrix.muladd", [Convergent, 'is_tied' is the flag of the output matrix has to be the same as the input accumulator (i.e., D has to be C) - 'accu_elem_type' is the component type of the accumulator matrix. - 'factor_elem_type' is the component type of the factor matrix. 
+ '$matrix_a_elem_type' is the component type of the matrix A + '$matrix_b_elem_type' is the component type of the matrix B + '$matrix_c_elem_type' is the component type of the matrix C }]; } diff --git a/lgc/interface/lgc/PassManager.h b/lgc/interface/lgc/PassManager.h index fa068a321d..c91b866e38 100644 --- a/lgc/interface/lgc/PassManager.h +++ b/lgc/interface/lgc/PassManager.h @@ -48,7 +48,7 @@ class LgcContext; class LegacyPassManager : public llvm::legacy::PassManager { public: static LegacyPassManager *Create(); - virtual ~LegacyPassManager() {} + virtual ~LegacyPassManager() = default; virtual void stop() = 0; virtual void setPassIndex(unsigned *passIndex) = 0; }; diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h index 5f6e9ce437..bca1917eae 100644 --- a/lgc/interface/lgc/Pipeline.h +++ b/lgc/interface/lgc/Pipeline.h @@ -335,7 +335,7 @@ struct ResourceNode { ResourceNodeType concreteType; // Underlying actual type of this node ResourceNodeType abstractType; // Node type for resource node matching - unsigned visibility; // Visibility bitmap: bit N set means entry is visible to ShaderStageEnum(N); value 0 + ShaderStageMask visibility; // Visibility bitmap: entry is visible to the shader stages in the mask; empty mask // means visible to all shader stages unsigned sizeInDwords; // Size in dwords unsigned offsetInDwords; // Offset in dwords @@ -741,7 +741,7 @@ class Pipeline { public: Pipeline(LgcContext *builderContext) : m_builderContext(builderContext) {} - virtual ~Pipeline() {} + virtual ~Pipeline() = default; // Get LgcContext LgcContext *getLgcContext() const { return m_builderContext; } diff --git a/lgc/interface/lgc/RayTracingLibrarySummary.h b/lgc/interface/lgc/RayTracingLibrarySummary.h index 9c65fd32ba..6c56d7ecf9 100644 --- a/lgc/interface/lgc/RayTracingLibrarySummary.h +++ b/lgc/interface/lgc/RayTracingLibrarySummary.h @@ -37,6 +37,7 @@ */ #pragma once +#include "llvmraytracing/PipelineState.h" #include "llvm/ADT/StringRef.h" 
#include "llvm/Support/Error.h" @@ -59,10 +60,6 @@ struct RayTracingLibrarySummary { // attributes (no AHS/IS/CHS). unsigned maxHitAttributeSize = 0; - // The maximum occurring number of payload registers in the pipeline, which will be taken into account for Traversal - // module so that it sees the correct maximum payload size of a pipeline. - unsigned maxUsedPayloadRegisterCount = 0; - // Whether a kernel entry function was built for this library. bool hasKernelEntry = false; @@ -76,6 +73,9 @@ struct RayTracingLibrarySummary { // flags). bool hasTraceRayModule = false; + // Opaque state owned by the llvmraytracing middle-end. + llvmraytracing::PipelineState llvmRaytracingState; + static llvm::Expected decodeMsgpack(llvm::StringRef data); std::string encodeMsgpack() const; diff --git a/lgc/patch/PatchLoopMetadata.cpp b/lgc/patch/AddLoopMetadata.cpp similarity index 99% rename from lgc/patch/PatchLoopMetadata.cpp rename to lgc/patch/AddLoopMetadata.cpp index e6af095cb8..d8d79d6b0c 100644 --- a/lgc/patch/PatchLoopMetadata.cpp +++ b/lgc/patch/AddLoopMetadata.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLoopMetadata.cpp + * @file AddLoopMetadata.cpp * @brief LLPC source file: contains implementation of class lgc::PatchLoopMetadata. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchLoopMetadata.h" +#include "lgc/patch/AddLoopMetadata.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/Support/CommandLine.h" diff --git a/lgc/patch/PatchCheckShaderCache.cpp b/lgc/patch/CheckShaderCache.cpp similarity index 94% rename from lgc/patch/PatchCheckShaderCache.cpp rename to lgc/patch/CheckShaderCache.cpp index 538f02a4d0..b8cc064322 100644 --- a/lgc/patch/PatchCheckShaderCache.cpp +++ b/lgc/patch/CheckShaderCache.cpp @@ -24,16 +24,16 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchCheckShaderCache.cpp - * @brief LLPC source file: contains implementation of class lgc::PatchCheckShaderCache. + * @file CheckShaderCache.cpp + * @brief LLPC source file: contains implementation of class lgc::CheckShaderCache. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchCheckShaderCache.h" +#include "lgc/patch/CheckShaderCache.h" #include "lgc/CommonDefs.h" #include "lgc/state/PipelineShaders.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "lgc-patch-check-shader-cache" +#define DEBUG_TYPE "lgc-check-shader-cache" using namespace llvm; using namespace lgc; @@ -57,7 +57,7 @@ template static void streamMapEntries(MapType &map, raw_ostream } // namespace // ===================================================================================================================== -PatchCheckShaderCache::PatchCheckShaderCache(Pipeline::CheckShaderCacheFunc callbackFunc) +CheckShaderCache::CheckShaderCache(Pipeline::CheckShaderCacheFunc callbackFunc) : m_callbackFunc(std::move(callbackFunc)) { } @@ -67,10 +67,10 @@ PatchCheckShaderCache::PatchCheckShaderCache(Pipeline::CheckShaderCacheFunc call // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchCheckShaderCache::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses CheckShaderCache::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Check-Shader-Cache\n"); + LLVM_DEBUG(dbgs() << "Run the pass Check-Shader-Cache\n"); if (m_callbackFunc == nullptr) { // No shader cache in use. 
diff --git a/lgc/patch/PatchImageOpCollect.cpp b/lgc/patch/CollectImageOperations.cpp similarity index 97% rename from lgc/patch/PatchImageOpCollect.cpp rename to lgc/patch/CollectImageOperations.cpp index dd316cbb88..069c3fe5c2 100644 --- a/lgc/patch/PatchImageOpCollect.cpp +++ b/lgc/patch/CollectImageOperations.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchImageOpCollect.cpp + * @file CollectImageOperations.cpp * @brief LLPC source file: contains implementation of class lgc::PatchImageOpCollect. *********************************************************************************************************************** */ -#include "lgc/patch/PatchImageOpCollect.h" +#include "lgc/patch/CollectImageOperations.h" #include "lgc/patch/Patch.h" #include "lgc/state/PipelineState.h" #include "llvm/InitializePasses.h" diff --git a/lgc/patch/CombineCooperativeMatrix.cpp b/lgc/patch/CombineCooperativeMatrix.cpp index 1b32e9753b..f1977df851 100644 --- a/lgc/patch/CombineCooperativeMatrix.cpp +++ b/lgc/patch/CombineCooperativeMatrix.cpp @@ -118,7 +118,7 @@ bool CooperativeMatrixCombiner::run() { [](auto &self, auto &op) { self.m_ops.push_back(&op); }) .add([](auto &self, auto &op) { #if !defined(LLVM_MAIN_REVISION) || LLVM_MAIN_REVISION >= 479080 - auto accumElemType = op.getAccuElemType(); + auto accumElemType = op.getMatrixCElemType(); bool isPackable = accumElemType == CooperativeMatrixElementType::Float16; if ((self.m_gfxIpVersion.major == 11) && isPackable) { self.m_muladds[op.getParent()].push_back(&op); diff --git a/lgc/patch/ConfigBuilderBase.cpp b/lgc/patch/ConfigBuilderBase.cpp index 9302cc39df..0823ef161f 100644 --- a/lgc/patch/ConfigBuilderBase.cpp +++ b/lgc/patch/ConfigBuilderBase.cpp @@ -75,10 +75,6 @@ 
ConfigBuilderBase::ConfigBuilderBase(Module *module, PipelineState *pipelineStat setApiName(pipelineState->getClient()); } -// ===================================================================================================================== -ConfigBuilderBase::~ConfigBuilderBase() { -} - // ===================================================================================================================== /// Adds the .shaders.$(apiStage).hardware_mapping node to the PAL metadata. /// Also add .shader_subtype if it is a compute shader. diff --git a/lgc/patch/ConfigBuilderBase.h b/lgc/patch/ConfigBuilderBase.h index 31c6c57acb..873a036270 100644 --- a/lgc/patch/ConfigBuilderBase.h +++ b/lgc/patch/ConfigBuilderBase.h @@ -60,7 +60,6 @@ struct PalMetadataNoteEntry { class ConfigBuilderBase { public: ConfigBuilderBase(llvm::Module *module, PipelineState *pipelineState); - ~ConfigBuilderBase(); void writePalMetadata(); llvm::msgpack::MapDocNode &getGraphicsRegNode() { return m_graphicsRegistersNode; } @@ -130,7 +129,7 @@ class ConfigBuilderBase { llvm::msgpack::Document *m_document; // The MsgPack document llvm::msgpack::MapDocNode m_pipelineNode; // MsgPack map node for amdpal.pipelines[0] - llvm::DenseMap m_apiShaderNodes; + ShaderStageMap m_apiShaderNodes; // MsgPack map node for each API shader's node in // ".shaders" llvm::msgpack::MapDocNode m_hwShaderNodes[unsigned(Util::Abi::HardwareStage::Count)]; diff --git a/lgc/patch/FragColorExport.cpp b/lgc/patch/FragColorExport.cpp index c8459c1514..52a5797573 100644 --- a/lgc/patch/FragColorExport.cpp +++ b/lgc/patch/FragColorExport.cpp @@ -755,26 +755,6 @@ void LowerFragColorExport::collectExportInfoForBuiltinOutput(Function *module, B m_exportValues[MaxColorTargets] = output; } -// ===================================================================================================================== -// Generates a dummy export instruction. Returns last export instruction that was generated. 
-// -// @param builder : The builder object that will be used to create new instructions. -CallInst *FragColorExport::addDummyExport(BuilderBase &builder) { - auto zero = ConstantFP::get(builder.getFloatTy(), 0.0); - auto poison = PoisonValue::get(builder.getFloatTy()); - Value *args[] = { - builder.getInt32(EXP_TARGET_MRT_0), // tgt - builder.getInt32(0x1), // en - zero, // src0 - poison, // src1 - poison, // src2 - poison, // src3 - builder.getFalse(), // done - builder.getTrue() // vm - }; - return builder.CreateIntrinsic(Intrinsic::amdgcn_exp, builder.getFloatTy(), args); -} - // ===================================================================================================================== // Sets the done flag on the given export instruction. // @@ -1061,9 +1041,35 @@ void FragColorExport::generateExportInstructions(ArrayRef info, } } if (!lastExport && dummyExport) { - lastExport = FragColorExport::addDummyExport(builder); + // NOTE: We maybe should not set SPI_SHADER_COL_FORMAT to 0 because of observe corruptions in some games. + // For performance, we must set the CB_SHADER_MASK to non-zero for RB+ optimization. In this case, PAL re-sets + // SPI_SHADER_COL_FORMAT to 32R, maybe causing a mismatch with CB_SHADER_MASK, there seems to be no impact on + // performance. + // For correctness, we should enable all channels enabled via the export format and write 0. + auto zero = ConstantFP::get(builder.getFloatTy(), 0.0); + auto zeros = ConstantVector::get({zero, zero, zero, zero}); + const auto expFmt = key.dummyExpFmt == EXP_FORMAT_ZERO ? 
EXP_FORMAT_32_R : key.dummyExpFmt; + lastExport = handleColorExportInstructions(zeros, 0, builder, expFmt, false, false); palMetadata->setPsDummyExport(); - finalExportFormats.push_back(EXP_FORMAT_32_R); + finalExportFormats.push_back(expFmt); + switch (expFmt) { + case EXP_FORMAT_32_R: { + cbShaderMask = 0x1U; + break; + } + case EXP_FORMAT_32_GR: { + cbShaderMask = 0x3U; + break; + } + case EXP_FORMAT_32_AR: { + cbShaderMask = 0x9U; + break; + } + default: { + cbShaderMask = 0xFU; + break; + } + } } if (lastExport) FragColorExport::setDoneFlag(lastExport, builder); @@ -1154,6 +1160,8 @@ FragColorExport::Key FragColorExport::computeKey(ArrayRef infos key.enableFragColor = pipelineState->getOptions().enableFragColor; key.colorExportState = pipelineState->getColorExportState(); key.waveSize = pipelineState->getShaderWaveSize(ShaderStage::Fragment); + key.dummyExpFmt = static_cast( + pipelineState->computeExportFormat(Type::getFloatTy(pipelineState->getContext()), 0, false)); if (!infos.empty() && infos[0].hwColorTarget == MaxColorTargets) { infos = infos.drop_front(1); diff --git a/lgc/patch/PatchCopyShader.cpp b/lgc/patch/GenerateCopyShader.cpp similarity index 92% rename from lgc/patch/PatchCopyShader.cpp rename to lgc/patch/GenerateCopyShader.cpp index b822fb5e94..ad4c50a5a1 100644 --- a/lgc/patch/PatchCopyShader.cpp +++ b/lgc/patch/GenerateCopyShader.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchCopyShader.cpp - * @brief LLPC source file: contains declaration and implementation of class lgc::PatchCopyShader. + * @file GenerateCopyShader.cpp + * @brief LLPC source file: contains declaration and implementation of class lgc::GenerateCopyShader. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchCopyShader.h" +#include "lgc/patch/GenerateCopyShader.h" #include "lgc/state/IntrinsDefs.h" #include "lgc/state/PalMetadata.h" #include "lgc/state/PipelineShaders.h" @@ -43,7 +43,7 @@ #include "llvm/Support/Debug.h" #include -#define DEBUG_TYPE "lgc-patch-copy-shader" +#define DEBUG_TYPE "lgc-generate-copy-shader" using namespace lgc; using namespace llvm; @@ -54,11 +54,11 @@ using namespace llvm; // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses GenerateCopyShader::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); PipelineShadersResult &pipelineShaders = analysisManager.getResult(module); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Copy-Shader\n"); + LLVM_DEBUG(dbgs() << "Run the pass Generate-Copy-Shader\n"); Patch::init(&module); @@ -193,13 +193,17 @@ PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &an if (m_pipelineState->isGsOnChip()) m_lds = Patch::getLdsVariable(m_pipelineState, entryPoint); - unsigned outputStreamCount = 0; - for (int i = 0; i < MaxGsStreams; ++i) { + SmallVector activeStreams; + for (unsigned i = 0; i < MaxGsStreams; ++i) { if (m_pipelineState->isVertexStreamActive(i)) - outputStreamCount++; + activeStreams.push_back(i); } + assert(!activeStreams.empty()); + + if (activeStreams.size() > 1) { + // Multiple streams + assert(m_pipelineState->enableXfb() || m_pipelineState->enablePrimStats()); - if (outputStreamCount > 1 && m_pipelineState->enableXfb()) { if 
(!m_pipelineState->getNggControl()->enableNgg) { // StreamId = streamInfo[25:24] auto streamInfo = getFunctionArgument(entryPoint, CopyShaderEntryArgIdxStreamInfo); @@ -232,19 +236,17 @@ PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &an // // Add switchInst to entry block - auto switchInst = builder.CreateSwitch(streamId, endBlock, outputStreamCount); + auto switchInst = builder.CreateSwitch(streamId, endBlock, activeStreams.size()); - for (unsigned streamId = 0; streamId < MaxGsStreams; ++streamId) { - if (m_pipelineState->isVertexStreamActive(streamId)) { - std::string blockName = ".stream" + std::to_string(streamId); - BasicBlock *streamBlock = BasicBlock::Create(*m_context, blockName, entryPoint, endBlock); - builder.SetInsertPoint(streamBlock); + for (auto activeStream : activeStreams) { + std::string blockName = ".stream" + std::to_string(activeStream); + BasicBlock *streamBlock = BasicBlock::Create(*m_context, blockName, entryPoint, endBlock); + builder.SetInsertPoint(streamBlock); - switchInst->addCase(builder.getInt32(streamId), streamBlock); + switchInst->addCase(builder.getInt32(activeStream), streamBlock); - exportOutput(streamId, builder); - builder.CreateBr(endBlock); - } + exportOutput(activeStream, builder); + builder.CreateBr(endBlock); } } else { // NOTE: If NGG, the copy shader with stream-out is not a real HW VS and will be incorporated into NGG @@ -264,15 +266,13 @@ PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &an // assert(gfxIp.major >= 11); // Must be GFX11+ - for (unsigned streamId = 0; streamId < MaxGsStreams; ++streamId) { - if (m_pipelineState->isVertexStreamActive(streamId)) - exportOutput(streamId, builder); - } + for (auto activeStream : activeStreams) + exportOutput(activeStream, builder); builder.CreateBr(endBlock); } } else { - // Just export outputs of rasterization stream - exportOutput(m_pipelineState->getRasterizerState().rasterStream, builder); + // Single stream + 
exportOutput(activeStreams[0], builder); builder.CreateBr(endBlock); } @@ -283,7 +283,7 @@ PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &an // Collects info for GS generic outputs. // // @param gsEntryPoint : Geometry shader entrypoint -void PatchCopyShader::collectGsGenericOutputInfo(Function *gsEntryPoint) { +void GenerateCopyShader::collectGsGenericOutputInfo(Function *gsEntryPoint) { auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::CopyShader); const auto &outputLocInfoMap = resUsage->inOutUsage.outputLocInfoMap; std::set visitedLocInfos; @@ -351,7 +351,9 @@ void PatchCopyShader::collectGsGenericOutputInfo(Function *gsEntryPoint) { // // @param streamId : Export output of this stream // @param builder : BuilderBase to use for instruction constructing -void PatchCopyShader::exportOutput(unsigned streamId, BuilderBase &builder) { +void GenerateCopyShader::exportOutput(unsigned streamId, BuilderBase &builder) { + assert(streamId < MaxGsStreams); + auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::CopyShader); auto &builtInUsage = resUsage->builtInUsage.gs; auto &locInfoXfbOutInfoMap = resUsage->inOutUsage.locInfoXfbOutInfoMap; @@ -486,8 +488,8 @@ void PatchCopyShader::exportOutput(unsigned streamId, BuilderBase &builder) { // @param compIdx : Output component // @param streamId : Output stream ID // @param builder : BuilderBase to use for instruction constructing -Value *PatchCopyShader::calcGsVsRingOffsetForInput(unsigned location, unsigned compIdx, unsigned streamId, - BuilderBase &builder) { +Value *GenerateCopyShader::calcGsVsRingOffsetForInput(unsigned location, unsigned compIdx, unsigned streamId, + BuilderBase &builder) { auto entryPoint = builder.GetInsertBlock()->getParent(); Value *vertexOffset = getFunctionArgument(entryPoint, CopyShaderEntryArgIdxVertexOffset); @@ -518,8 +520,8 @@ Value *PatchCopyShader::calcGsVsRingOffsetForInput(unsigned location, unsigned c // @param component : 
Output component // @param streamId : Output stream ID // @param builder : BuilderBase to use for instruction constructing -Value *PatchCopyShader::loadValueFromGsVsRing(Type *loadTy, unsigned location, unsigned component, unsigned streamId, - BuilderBase &builder) { +Value *GenerateCopyShader::loadValueFromGsVsRing(Type *loadTy, unsigned location, unsigned component, unsigned streamId, + BuilderBase &builder) { auto entryPoint = builder.GetInsertBlock()->getParent(); unsigned elemCount = 1; @@ -601,7 +603,7 @@ Value *PatchCopyShader::loadValueFromGsVsRing(Type *loadTy, unsigned location, u // @param outputValue : Value exported to output // @param location : Location of the output // @param builder : BuilderBase to use for instruction constructing -void PatchCopyShader::exportGenericOutput(Value *outputValue, unsigned location, BuilderBase &builder) { +void GenerateCopyShader::exportGenericOutput(Value *outputValue, unsigned location, BuilderBase &builder) { auto outputTy = outputValue->getType(); assert(outputTy->isSingleValueType()); std::string instName(lgcName::OutputExportGeneric); @@ -615,7 +617,7 @@ void PatchCopyShader::exportGenericOutput(Value *outputValue, unsigned location, // @param outputValue : Value exported to output // @param xfbOutInfo : The reference to a transform feedback output info // @param builder : BuilderBase to use for instruction constructing -void PatchCopyShader::exportXfbOutput(Value *outputValue, const XfbOutInfo &xfbOutInfo, BuilderBase &builder) { +void GenerateCopyShader::exportXfbOutput(Value *outputValue, const XfbOutInfo &xfbOutInfo, BuilderBase &builder) { if (xfbOutInfo.is16bit) { // NOTE: For 16-bit transform feedback output, the value is 32-bit dword loaded from GS-VS ring // buffer. The high word is always zero while the low word contains the data value. 
We have to @@ -657,8 +659,8 @@ void PatchCopyShader::exportXfbOutput(Value *outputValue, const XfbOutInfo &xfbO // @param builtInId : ID of the built-in variable // @param streamId : ID of output vertex stream // @param builder : BuilderBase to use for instruction constructing -void PatchCopyShader::exportBuiltInOutput(Value *outputValue, BuiltInKind builtInId, unsigned streamId, - BuilderBase &builder) { +void GenerateCopyShader::exportBuiltInOutput(Value *outputValue, BuiltInKind builtInId, unsigned streamId, + BuilderBase &builder) { auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::CopyShader); if (m_pipelineState->enableXfb()) { diff --git a/lgc/patch/PatchLlvmIrInclusion.cpp b/lgc/patch/IncludeLlvmIr.cpp similarity index 97% rename from lgc/patch/PatchLlvmIrInclusion.cpp rename to lgc/patch/IncludeLlvmIr.cpp index 3ff1c7f387..9553f03706 100644 --- a/lgc/patch/PatchLlvmIrInclusion.cpp +++ b/lgc/patch/IncludeLlvmIr.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLlvmIrInclusion.cpp + * @file IncludeLlvmIr.cpp * @brief LLPC source file: contains implementation of class lgc::PatchLlvmIrInclusion. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchLlvmIrInclusion.h" +#include "lgc/patch/IncludeLlvmIr.h" #include "lgc/state/Abi.h" #include "lgc/state/PipelineState.h" #include "llvm/IR/Constants.h" diff --git a/lgc/patch/LowerCooperativeMatrix.cpp b/lgc/patch/LowerCooperativeMatrix.cpp index ad7b274447..bec5cec5d7 100644 --- a/lgc/patch/LowerCooperativeMatrix.cpp +++ b/lgc/patch/LowerCooperativeMatrix.cpp @@ -48,6 +48,37 @@ using namespace lgc; namespace lgc { +static const Intrinsic::AMDGCNIntrinsics InvalidInstricID = Intrinsic::AMDGCNIntrinsics(0xFFFFFFFF); +static const Intrinsic::AMDGCNIntrinsics GetWmmaIntrinsic(GfxIpVersion gfxIp, CooperativeMatrixElementType typeA, + CooperativeMatrixElementType typeB, + CooperativeMatrixElementType typeC, bool isTiled = false) { + assert(gfxIp.major >= 11); + switch (typeA) { + case CooperativeMatrixElementType::Float16: { + assert(typeA == typeB); + if (typeC == CooperativeMatrixElementType::Float16) + return isTiled ? Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied : Intrinsic::amdgcn_wmma_f16_16x16x16_f16; + if (typeC == CooperativeMatrixElementType::Float32) + return Intrinsic::amdgcn_wmma_f32_16x16x16_f16; + } + case CooperativeMatrixElementType::BFloat16: { + assert(typeA == typeB); + if (typeC == CooperativeMatrixElementType::BFloat16) + return isTiled ? 
Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied : Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; + if (typeC == CooperativeMatrixElementType::Float32) + return Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; + } + case CooperativeMatrixElementType::Int8: { + if (typeC == CooperativeMatrixElementType::Int32) + return Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; + } + default: + break; + } + + return InvalidInstricID; +} + // ===================================================================================================================== // Run the patch cooperative matrix pass on a module // @@ -130,6 +161,8 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties props.numMatrixWords = 8; break; case CooperativeMatrixElementType::Int8: + case CooperativeMatrixElementType::Float8: + case CooperativeMatrixElementType::BFloat8: props.numMatrixElements = 16; props.numMatrixWords = 4; break; @@ -137,7 +170,7 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties llvm_unreachable("unknown element type"); } - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); if (layout == CooperativeMatrixLayout::FactorMatrixLayout) { assert(elemType != CooperativeMatrixElementType::Float32 && elemType != CooperativeMatrixElementType::Int32); props.numFlatElements = 16; @@ -273,7 +306,7 @@ LowerCooperativeMatrix::computeAddressing(CooperativeMatrixLayout layout, Cooper void LowerCooperativeMatrix::visitCooperativeMatrixLengthOp(CooperativeMatrixLengthOp &matrixlength) { BuilderBase builder(*m_context); builder.SetInsertPoint(&matrixlength); - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); auto layout = matrixlength.getLayout(); unsigned length = 0; switch (layout) { @@ -466,7 +499,7 @@ void 
LowerCooperativeMatrix::visitCooperativeMatrixExtractOp(CooperativeMatrixEx // a specialization constant even though, at the time of specialization constant lowering, we don't yet know the // wave size. We should remove this once a healther KHR extension has been released. if (layout == CooperativeMatrixLayout::AccumulatorMatrixLayout && - m_pipelineState->getShaderWaveSize(m_shaderStage) == 64) { + m_pipelineState->getShaderWaveSize(m_shaderStage.value()) == 64) { unsigned length = cast(vec->getType())->getNumElements(); index = builder.CreateAnd(index, builder.getInt32(length - 1)); } @@ -495,7 +528,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixInsertOp(CooperativeMatrixIns // a specialization constant even though, at the time of specialization constant lowering, we don't yet know the // wave size. We should remove this once a healther KHR extension has been released. if (layout == CooperativeMatrixLayout::AccumulatorMatrixLayout && - m_pipelineState->getShaderWaveSize(m_shaderStage) == 64) { + m_pipelineState->getShaderWaveSize(m_shaderStage.value()) == 64) { unsigned length = cast(vec->getType())->getNumElements(); Value *outOfBounds = builder.CreateICmpUGE(index, builder.getInt32(length)); index = builder.CreateAnd(index, builder.getInt32(length - 1)); @@ -557,12 +590,11 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvertInternal(CastInst::CastOp resultValue = builder.CreateCast(Instruction::FPExt, source, FixedVectorType::get(builder.getFloatTy(), vecSize), "Convert16tofloat32"); resultValue = builder.CreateFPTrunc(resultValue, dstType); - } else { + } else resultValue = builder.CreateCast(castOp, source, dstType, "castOpConvert"); - } if (dstElemType == CooperativeMatrixElementType::BFloat16) { - resultValue = builder.CreateBitCast(resultValue, FixedVectorType::get(builder.getInt16Ty(), vecSize)); + return builder.CreateBitCast(resultValue, FixedVectorType::get(builder.getInt16Ty(), vecSize)); } return resultValue; @@ -731,7 +763,7 @@ Value 
*LowerCooperativeMatrix::cooperativeMatrixReshape16BitElementGfx1011( BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); Value *resultValue = nullptr; - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); Value *laneGroupIdx = builder.CreateUDiv(threadId, builder.getInt32(16)); Value *isEvenGroup = builder.CreateICmpEQ(builder.CreateAnd(laneGroupIdx, builder.getInt32(1)), builder.getInt32(0)); @@ -919,7 +951,7 @@ Value *LowerCooperativeMatrix::cooperativeMatrixReshapeBetween8bitAnd32bitElemen BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); Value *resultValue = nullptr; - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); Value *threadId = getLaneNumber(builder); Value *laneGroupIdx = builder.CreateUDiv(threadId, builder.getInt32(16)); Value *isEvenGroup = builder.CreateICmpEQ(builder.CreateAnd(laneGroupIdx, builder.getInt32(1)), builder.getInt32(0)); @@ -1386,50 +1418,53 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul Value *matrixA = muladd.getMatrixA(); Value *matrixB = muladd.getMatrixB(); Value *matrixC = muladd.getMatrixC(); - auto factorElemType = muladd.getFactorElemType(); - auto accumElemType = muladd.getAccuElemType(); + auto matrixAType = muladd.getMatrixAElemType(); + auto matrixBType = muladd.getMatrixBElemType(); + auto matrixCType = muladd.getMatrixCElemType(); bool isSignedA = muladd.getIsSignedA(); bool isSignedB = muladd.getIsSignedB(); bool isSatOrOpsel = muladd.getIsSatOrOpsel(); StringRef instName = muladd.getName(); + // Gfx11: + // wave64: + // declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>) + // declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>) + // declare <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg) + // declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg) + // declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 + // immarg) declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x + // i32>, i1 immarg) + // wave32: + // declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>) + // declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>) + // declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) + // declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) + // declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 + // immarg) declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x + // i32>, i1 immarg) + if (m_gfxIp.major >= 11) { - // Gfx11: - // wave64: - // declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>) - // declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>) - // declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg) - // declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg) - // declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 - // immarg) declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x - // i32>, i1 immarg) - // wave32: - // declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>) - // declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>) - // declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) - // declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) - // declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 - // immarg) declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x - // i32>, i1 immarg) Value *matrixD; - unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); - if (BuilderCommon::isTypeNCooperativeMatrix(factorElemType, 16)) { + if (BuilderCommon::isTypeNCooperativeMatrix(matrixAType, 16)) { + assert(matrixAType == matrixBType); unsigned factorFlatElemNum = 0; { factorFlatElemNum = 16; } Type *factorType = - FixedVectorType::get(builder.transCooperativeMatrixElementType(factorElemType), factorFlatElemNum); + FixedVectorType::get(builder.transCooperativeMatrixElementType(matrixAType), factorFlatElemNum); matrixA = builder.CreateBitCast(matrixA, factorType); matrixB = builder.CreateBitCast(matrixB, factorType); - } else if (BuilderCommon::isTypeNCooperativeMatrix(factorElemType, 8)) { + } else if (BuilderCommon::isTypeNCooperativeMatrix(matrixAType, 8)) { } else { llvm_unreachable("Factor element type is not supported!"); } - if (BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 32)) { + if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 32)) { matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") : matrixC; - } else if (BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 16)) { + } else if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 16)) { { matrixC = waveSize == 64 ? 
builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") : matrixC; @@ -1437,7 +1472,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul unsigned matrixLength = cast(matrixC->getType())->getNumElements(); Type *castType = nullptr; - if (accumElemType == CooperativeMatrixElementType::BFloat16) { + if (matrixCType == CooperativeMatrixElementType::BFloat16) { // HW instructions require i16 type for bfloat16. castType = builder.getInt16Ty(); } else @@ -1448,51 +1483,44 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul llvm_unreachable("Accumulator element type is not supported!"); } - if (factorElemType == CooperativeMatrixElementType::BFloat16) { - Intrinsic::AMDGCNIntrinsics intrinsic = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; - SmallVector args({matrixA, matrixB, matrixC}); - if (accumElemType == CooperativeMatrixElementType::Float32) - intrinsic = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; - else { - assert(accumElemType == CooperativeMatrixElementType::BFloat16); - args.push_back(builder.getInt1(isSatOrOpsel)); - if (muladd.getIsTied()) - intrinsic = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; - } - matrixD = builder.CreateIntrinsic(matrixC->getType(), intrinsic, args, nullptr, instName); - } else if (factorElemType == CooperativeMatrixElementType::Float16 && - accumElemType == CooperativeMatrixElementType::Float32) { - matrixD = builder.CreateIntrinsic(matrixC->getType(), Intrinsic::amdgcn_wmma_f32_16x16x16_f16, - {matrixA, matrixB, matrixC}, nullptr, instName); - - } else if (factorElemType == CooperativeMatrixElementType::Int8 && - accumElemType == CooperativeMatrixElementType::Int32) { - matrixD = builder.CreateIntrinsic(matrixC->getType(), Intrinsic::amdgcn_wmma_i32_16x16x16_iu8, - {builder.getInt1(isSignedA), matrixA, builder.getInt1(isSignedB), matrixB, - matrixC, builder.getInt1(isSatOrOpsel)}, - nullptr, instName); - - } else if (factorElemType == 
CooperativeMatrixElementType::Float16 && - accumElemType == CooperativeMatrixElementType::Float16) { - // Matrix convert to match intrinsic arguments: Wave32: float32*v8->half*v16 - // Wave64: float32*v4->half*v8 - bool isTied = muladd.getIsTied(); - auto intrinsic = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; - if (isTied) -#if defined(LLVM_MAIN_REVISION) && LLVM_MAIN_REVISION < 479080 - llvm_unreachable("Tied intrinsics not implemented"); -#else - intrinsic = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; -#endif - matrixD = builder.CreateIntrinsic(matrixC->getType(), intrinsic, - {matrixA, matrixB, matrixC, builder.getInt1(isSatOrOpsel)}, nullptr, instName); - } else { - llvm_unreachable("The accumulator type is not supported."); + auto intrinsic = GetWmmaIntrinsic(m_gfxIp, matrixAType, matrixBType, matrixCType, muladd.getIsTied()); + if (intrinsic == InvalidInstricID) + llvm_unreachable("HW intrinsics not supported!"); + + SmallVector args; + switch (intrinsic) { + case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: + args.push_back(matrixA); + args.push_back(matrixB); + args.push_back(matrixC); + break; + case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: + case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: + case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: + case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: + args.push_back(matrixA); + args.push_back(matrixB); + args.push_back(matrixC); + args.push_back(builder.getInt1(isSatOrOpsel)); + break; + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: + args.push_back(builder.getInt1(isSignedA)); + args.push_back(matrixA); + args.push_back(builder.getInt1(isSignedB)); + args.push_back(matrixB); + args.push_back(matrixC); + args.push_back(builder.getInt1(isSatOrOpsel)); + break; + default: + llvm_unreachable("Should never be called!"); + break; } + matrixD = builder.CreateIntrinsic(matrixC->getType(), intrinsic, args, nullptr, instName); - if 
(BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 16)) { + if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 16)) { unsigned coopVeclength = cast(matrixD->getType())->getNumElements(); - Type *wordTy = builder.transCooperativeMatrixElementType(accumElemType)->isIntOrIntVectorTy() + Type *wordTy = builder.transCooperativeMatrixElementType(matrixCType)->isIntOrIntVectorTy() ? builder.getInt32Ty() : builder.getFloatTy(); matrixD = builder.CreateBitCast(matrixD, FixedVectorType::get(wordTy, coopVeclength / 2)); @@ -1512,8 +1540,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul } // Emulator on NAVI2X - Type *packedTy = - (factorElemType == CooperativeMatrixElementType::Float16) ? builder.getFloatTy() : builder.getInt32Ty(); + Type *packedTy = (matrixAType == CooperativeMatrixElementType::Float16) ? builder.getFloatTy() : builder.getInt32Ty(); Value *dotProductValue; Value *threadId = getLaneNumber(builder); @@ -1529,7 +1556,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul }; // matrixC is not reshaped for gfx10 - if (accumElemType == CooperativeMatrixElementType::Float32 || accumElemType == CooperativeMatrixElementType::Int32) { + if (matrixCType == CooperativeMatrixElementType::Float32 || matrixCType == CooperativeMatrixElementType::Int32) { dotProductValue = PoisonValue::get(FixedVectorType::get(packedTy, 8)); for (unsigned idxc = 0; idxc < 8; ++idxc) { Value *rowlowgroup = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc * 2)); @@ -1537,25 +1564,24 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul Value *rowData = builder.CreateSelect(isEvenGroup, rowlowgroup, rowhighgroup); Value *mulAB; Value *initAccumulator = builder.CreateExtractElement(matrixC, idxc); - if (factorElemType == CooperativeMatrixElementType::Float16) { + if (matrixAType == CooperativeMatrixElementType::Float16) { mulAB = 
createDotProductFp16Fp32(rowData, matrixB, initAccumulator, isSatOrOpsel, instName, &muladd); - } else if (factorElemType == CooperativeMatrixElementType::Int16) { + } else if (matrixAType == CooperativeMatrixElementType::Int16) { mulAB = createDotProductInt16Int32(rowData, matrixB, initAccumulator, flags, isSatOrOpsel, instName, &muladd); - } else if (factorElemType == CooperativeMatrixElementType::Int8) { + } else if (matrixAType == CooperativeMatrixElementType::Int8) { mulAB = createDotProductInt8Int32(rowData, matrixB, initAccumulator, flags, isSatOrOpsel, instName, &muladd); } else { llvm_unreachable("Unsupported element type!"); } dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB, idxc); } - } else if (accumElemType == CooperativeMatrixElementType::Int16 || - accumElemType == CooperativeMatrixElementType::Float16) { - dotProductValue = - PoisonValue::get(FixedVectorType::get(builder.transCooperativeMatrixElementType(accumElemType), 8)); + } else if (matrixCType == CooperativeMatrixElementType::Int16 || + matrixCType == CooperativeMatrixElementType::Float16) { + dotProductValue = PoisonValue::get(FixedVectorType::get(builder.transCooperativeMatrixElementType(matrixCType), 8)); // For gfx10, A*B:8*float32->16*half C: no reshape for 16bit, still 16*half Value *colData = - convCoopMatrixVecToFlatVec(builder, matrixB, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); - matrixC = convCoopMatrixVecToFlatVec(builder, matrixC, accumElemType, + convCoopMatrixVecToFlatVec(builder, matrixB, matrixAType, CooperativeMatrixLayout::FactorMatrixLayout); + matrixC = convCoopMatrixVecToFlatVec(builder, matrixC, matrixCType, CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout); for (unsigned idxc = 0, accIdx = 0; idxc < 16; idxc += 4, accIdx += 2) { @@ -1568,16 +1594,16 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul Value *rowData2 = builder.CreateSelect(isEvenGroup, rowData2Low, rowData2High); rowData1 = 
- convCoopMatrixVecToFlatVec(builder, rowData1, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); + convCoopMatrixVecToFlatVec(builder, rowData1, matrixAType, CooperativeMatrixLayout::FactorMatrixLayout); rowData2 = - convCoopMatrixVecToFlatVec(builder, rowData2, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); + convCoopMatrixVecToFlatVec(builder, rowData2, matrixAType, CooperativeMatrixLayout::FactorMatrixLayout); Value *mulAB1; Value *mulAB2; Value *accumulator1 = builder.CreateExtractElement(matrixC, accIdx); Value *accumulator2 = builder.CreateExtractElement(matrixC, accIdx + 1); - if (accumElemType == CooperativeMatrixElementType::Float16) { + if (matrixCType == CooperativeMatrixElementType::Float16) { mulAB1 = createDotProductFp16Fp16(rowData1, colData, accumulator1, isSatOrOpsel, instName, &muladd); mulAB2 = createDotProductFp16Fp16(rowData2, colData, accumulator2, isSatOrOpsel, instName, &muladd); } else { @@ -1588,7 +1614,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB2, accIdx + 1); } - dotProductValue = convFlatVecToCoopMatrixVec(builder, dotProductValue, accumElemType, + dotProductValue = convFlatVecToCoopMatrixVec(builder, dotProductValue, matrixCType, CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout); } else { llvm_unreachable("The accumulator type is not supported."); @@ -1884,7 +1910,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixUnPackOp(CooperativeMatrixUnP // @param builder : The IR builder to create and insert IR instruction Value *LowerCooperativeMatrix::getLaneNumber(BuilderBase &builder) { Value *result = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {builder.getInt32(-1), builder.getInt32(0)}); - if (m_pipelineState->getShaderWaveSize(m_shaderStage) == 64) + if (m_pipelineState->getShaderWaveSize(m_shaderStage.value()) == 64) result = 
builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {builder.getInt32(-1), result}); return result; } @@ -2082,7 +2108,7 @@ void LowerCooperativeMatrix::visitCooperativeRowAccExpandOp(CooperativeRowAccExp {mappedArgs[0], passthroughArgs[0], passthroughArgs[1], passthroughArgs[2], passthroughArgs[3]}); }; - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); assert(waveSize == 32 || waveSize == 64); DppCtrl shuffleCtrl[4] = {DppCtrl(UINT32_MAX), DppCtrl(UINT32_MAX), DppCtrl(UINT32_MAX), DppCtrl(UINT32_MAX)}; diff --git a/lgc/patch/LowerDebugPrintf.cpp b/lgc/patch/LowerDebugPrintf.cpp index 98bfe7a2d4..8f9e0d7e97 100644 --- a/lgc/patch/LowerDebugPrintf.cpp +++ b/lgc/patch/LowerDebugPrintf.cpp @@ -75,8 +75,10 @@ PreservedAnalyses LowerDebugPrintf::run(Module &module, ModuleAnalysisManager &a return PreservedAnalyses::all(); const ResourceNode *node = nullptr; - std::tie(m_topNode, node) = pipelineState->findResourceNode(ResourceNodeType::DescriptorBuffer, - InternalDescriptorSetId, PrintfBufferBindingId); + // LLpc node type is DescriptorBuffer + // So use ResourceNodeType::Unknown to match different node type. 
+ std::tie(m_topNode, node) = + pipelineState->findResourceNode(ResourceNodeType::Unknown, InternalDescriptorSetId, PrintfBufferBindingId); static const auto lowerDebugfPrintOpVisitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) diff --git a/lgc/patch/LowerGpuRt.cpp b/lgc/patch/LowerGpuRt.cpp index 33a65e3fbd..fcc2eff4c3 100644 --- a/lgc/patch/LowerGpuRt.cpp +++ b/lgc/patch/LowerGpuRt.cpp @@ -59,8 +59,7 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi Builder builderImpl(pipelineState->getContext()); m_builder = &builderImpl; - - createGlobalStack(module); + createLdsStack(module); static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) @@ -104,20 +103,25 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi // ===================================================================================================================== // Get pipeline workgroup size for stack size calculation -unsigned LowerGpuRt::getWorkgroupSize() const { +// @param [in] Function : The function to retrieve shader information +unsigned LowerGpuRt::getWorkgroupSize(Function *func) const { unsigned workgroupSize = 0; - if (m_pipelineState->isGraphics()) { - // Force 64 for graphics stages - workgroupSize = 64; - } else { + auto stage = getShaderStage(func); + const unsigned waveSize = m_pipelineState->getShaderWaveSize(stage.value()); + if (stage == ShaderStage::Mesh) { + auto &meshMode = m_pipelineState->getShaderModes()->getMeshShaderMode(); + workgroupSize = meshMode.workgroupSizeX * meshMode.workgroupSizeY * meshMode.workgroupSizeZ; + } else if (stage == ShaderStage::Task || stage == ShaderStage::Compute) { ComputeShaderMode mode = m_pipelineState->getShaderModes()->getComputeShaderMode(); workgroupSize = mode.workgroupSizeX * mode.workgroupSizeY * mode.workgroupSizeZ; + } else { + 
assert(m_pipelineState->isGraphics()); + workgroupSize = 64; } assert(workgroupSize != 0); - if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) { - // Round up to multiple of 32, as the ds_bvh_stack swizzle as 32 threads - workgroupSize = alignTo(workgroupSize, 32); - } + + workgroupSize = alignTo(workgroupSize, waveSize); + return workgroupSize; } @@ -136,33 +140,47 @@ Value *LowerGpuRt::getThreadIdInGroup() const { } // ===================================================================================================================== -// Create global variable for the stack +// Update the workgroup size from different functions +// @param func : Function to get WorkgroupSize from +void LowerGpuRt::updateWorkgroupSize(Function *func) { + unsigned funcWorkSize = getWorkgroupSize(func); + m_workGroupSize = m_workGroupSize > funcWorkSize ? m_workGroupSize : funcWorkSize; +} + +// ===================================================================================================================== +// Create global variable for the lds stack // @param [in/out] module : LLVM module to be run on -void LowerGpuRt::createGlobalStack(Module &module) { +void LowerGpuRt::createLdsStack(Module &module) { struct Payload { - bool needGlobalStack; + bool needLdsStack; bool needExtraStack; + LowerGpuRt *lowerRt; }; - Payload payload = {false, false}; + Payload payload = {false, false, this}; + m_workGroupSize = 0; static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add([](auto &payload, auto &op) { - payload.needGlobalStack = true; + payload.needLdsStack = true; payload.needExtraStack |= op.getUseExtraStack(); + payload.lowerRt->updateWorkgroupSize(op.getFunction()); }) .add([](auto &payload, auto &op) { - payload.needGlobalStack = true; + payload.needLdsStack = true; payload.needExtraStack |= op.getUseExtraStack(); + payload.lowerRt->updateWorkgroupSize(op.getFunction()); }) .add([](auto 
&payload, auto &op) { - payload.needGlobalStack = true; + payload.needLdsStack = true; payload.needExtraStack |= op.getUseExtraStack(); + payload.lowerRt->updateWorkgroupSize(op.getFunction()); }) .build(); visitor.visit(payload, module); - if (payload.needGlobalStack) { - auto ldsStackSize = getWorkgroupSize() * MaxLdsStackEntries; + if (payload.needLdsStack) { + assert(m_workGroupSize > 0); + auto ldsStackSize = m_workGroupSize * MaxLdsStackEntries; // Double LDS size when any operations requires to perform on extra stack. if (payload.needExtraStack) ldsStackSize = ldsStackSize << 1; @@ -183,7 +201,7 @@ void LowerGpuRt::createGlobalStack(Module &module) { void LowerGpuRt::visitGetStackSize(GpurtGetStackSizeOp &inst) { m_builder->SetInsertPoint(&inst); Value *size = nullptr; - size = m_builder->getInt32(MaxLdsStackEntries * getWorkgroupSize()); + size = m_builder->getInt32(MaxLdsStackEntries * m_workGroupSize); inst.replaceAllUsesWith(size); m_callsToLower.push_back(&inst); m_funcsToLower.insert(inst.getCalledFunction()); @@ -207,7 +225,7 @@ void LowerGpuRt::visitGetStackBase(GpurtGetStackBaseOp &inst) { // @param inst : The dialect instruction to process void LowerGpuRt::visitGetStackStride(GpurtGetStackStrideOp &inst) { m_builder->SetInsertPoint(&inst); - Value *stride = m_builder->getInt32(getWorkgroupSize()); + Value *stride = m_builder->getInt32(m_workGroupSize); inst.replaceAllUsesWith(stride); m_callsToLower.push_back(&inst); m_funcsToLower.insert(inst.getCalledFunction()); @@ -222,7 +240,7 @@ void LowerGpuRt::visitStackRead(GpurtStackReadOp &inst) { Value *stackIndex = inst.getIndex(); Type *stackTy = PointerType::get(m_builder->getInt32Ty(), 3); if (inst.getUseExtraStack()) { - auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries); + auto ldsStackSize = m_builder->getInt32(m_workGroupSize * MaxLdsStackEntries); stackIndex = m_builder->CreateAdd(stackIndex, ldsStackSize); } @@ -244,7 +262,7 @@ void 
LowerGpuRt::visitStackWrite(GpurtStackWriteOp &inst) { Value *stackData = inst.getValue(); Type *stackTy = PointerType::get(m_builder->getInt32Ty(), 3); if (inst.getUseExtraStack()) { - auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries); + auto ldsStackSize = m_builder->getInt32(m_workGroupSize * MaxLdsStackEntries); stackIndex = m_builder->CreateAdd(stackIndex, ldsStackSize); } @@ -266,7 +284,7 @@ void LowerGpuRt::visitLdsStackInit(GpurtLdsStackInitOp &inst) { // From Navi3x on, Hardware has decided that the stacks are only swizzled across every 32 threads, // with stacks for every set of 32 threads stored after all the stack data for the previous 32 threads. - if (getWorkgroupSize() > 32) { + if (m_workGroupSize > 32) { // localThreadId = (LinearLocalThreadID%32) // localGroupId = (LinearLocalThreadID/32) // stackSize = STACK_SIZE * 32 = m_stackEntries * 32 @@ -281,7 +299,7 @@ void LowerGpuRt::visitLdsStackInit(GpurtLdsStackInitOp &inst) { } if (inst.getUseExtraStack()) { - auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries); + auto ldsStackSize = m_builder->getInt32(m_workGroupSize * MaxLdsStackEntries); stackBasePerThread = m_builder->CreateAdd(stackBasePerThread, ldsStackSize); } diff --git a/lgc/patch/PatchImageDerivatives.cpp b/lgc/patch/LowerImageDerivatives.cpp similarity index 95% rename from lgc/patch/PatchImageDerivatives.cpp rename to lgc/patch/LowerImageDerivatives.cpp index e7798d7acd..933b178113 100644 --- a/lgc/patch/PatchImageDerivatives.cpp +++ b/lgc/patch/LowerImageDerivatives.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchImageDerivatives.cpp - * @brief LLPC source file: contains implementation of class lgc::PatchImageDerivatives. 
+ * @file LowerImageDerivatives.cpp + * @brief LLPC source file: contains implementation of class lgc::LowerImageDerivatives. *********************************************************************************************************************** */ -#include "lgc/patch/PatchImageDerivatives.h" +#include "lgc/patch/LowerImageDerivatives.h" #include "lgc/patch/Patch.h" #include "lgc/state/PipelineState.h" #include "llvm/ADT/SmallSet.h" @@ -37,7 +37,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "lgc-patch-image-derivatives" +#define DEBUG_TYPE "lgc-lower-image-derivatives" using namespace llvm; using namespace lgc; @@ -58,10 +58,10 @@ static bool usesImplicitDerivatives(StringRef name) { // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchImageDerivatives::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerImageDerivatives::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Image-Derivatives\n"); + LLVM_DEBUG(dbgs() << "Run the pass Lower-Image-Derivatives\n"); if (!pipelineState->hasShaderStage(ShaderStage::Fragment)) return PreservedAnalyses::all(); diff --git a/lgc/patch/PatchInOutImportExport.cpp b/lgc/patch/LowerInOut.cpp similarity index 98% rename from lgc/patch/PatchInOutImportExport.cpp rename to lgc/patch/LowerInOut.cpp index 2040857824..fdff15e302 100644 --- a/lgc/patch/PatchInOutImportExport.cpp +++ b/lgc/patch/LowerInOut.cpp @@ -24,12 +24,12 @@ **********************************************************************************************************************/ /** 
*********************************************************************************************************************** - * @file PatchInOutImportExport.cpp + * @file LowerInOut.cpp * @brief LLPC source file: contains implementation of class lgc::PatchInOutImportExport. * *********************************************************************************************************************** */ -#include "lgc/patch/PatchInOutImportExport.h" +#include "lgc/patch/LowerInOut.h" #include "lgc/Builder.h" #include "lgc/BuiltIns.h" #include "lgc/LgcDialect.h" @@ -141,11 +141,10 @@ PreservedAnalyses PatchInOutImportExport::run(Module &module, ModuleAnalysisMana // Process each shader in turn, in reverse order (because for example VS uses inOutUsage.tcs.calcFactor // set by TCS). - for (int shaderStage = ShaderStage::CountInternal - 1; shaderStage >= 0; --shaderStage) { - auto entryPoint = pipelineShaders.getEntryPoint(static_cast(shaderStage)); + for (auto stage : llvm::reverse(ShaderStagesNativeCopy)) { + auto entryPoint = pipelineShaders.getEntryPoint(stage); if (entryPoint) { - processFunction(*entryPoint, static_cast(shaderStage), inputCallees, otherCallees, - getPostDominatorTree); + processFunction(*entryPoint, stage, inputCallees, otherCallees, getPostDominatorTree); } } @@ -246,8 +245,8 @@ void PatchInOutImportExport::markExportDone(Function *func, PostDominatorTree &p // Process a single shader void PatchInOutImportExport::processShader() { // Initialize the output value for gl_PrimitiveID - const auto &builtInUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->builtInUsage; - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &builtInUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->builtInUsage; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; if (m_shaderStage == ShaderStage::Vertex) { if 
(builtInUsage.vs.primitiveId) m_primitiveId = getFunctionArgument(m_entryPoint, entryArgIdxs.vs.primitiveId); @@ -453,7 +452,7 @@ void PatchInOutImportExport::processShader() { unsigned workgroupSizeX = mode.workgroupSizeX; unsigned workgroupSizeY = mode.workgroupSizeY; unsigned workgroupSizeZ = mode.workgroupSizeZ; - SwizzleWorkgroupLayout layout = calculateWorkgroupLayout(m_pipelineState, m_shaderStage); + SwizzleWorkgroupLayout layout = calculateWorkgroupLayout(m_pipelineState, m_shaderStage.value()); while (!func.use_empty()) { CallInst *reconfigCall = cast(*func.user_begin()); Value *localInvocationId = reconfigCall->getArgOperand(0); @@ -463,7 +462,7 @@ void PatchInOutImportExport::processShader() { (layout.macroLayout == WorkgroupLayout::SexagintiQuads)) { BuilderBase builder(reconfigCall); localInvocationId = reconfigWorkgroupLayout( - localInvocationId, m_pipelineState, m_shaderStage, layout.macroLayout, layout.microLayout, + localInvocationId, m_pipelineState, m_shaderStage.value(), layout.macroLayout, layout.microLayout, workgroupSizeX, workgroupSizeY, workgroupSizeZ, isHwLocalInvocationId, builder); } } @@ -514,7 +513,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { BuilderBase builder(*m_context); builder.SetInsertPoint(&callInst); - auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); auto mangledName = callee->getName(); @@ -556,7 +555,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { LLVM_DEBUG(dbgs() << "Find input import call: builtin = " << builtInId << "\n"); - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: // Nothing to do break; @@ -638,7 +637,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { origLocInfo.setLocation(origLoc); if (m_shaderStage == ShaderStage::TessEval || (m_shaderStage == ShaderStage::Fragment && - 
(m_pipelineState->getPrevShaderStage(m_shaderStage) == ShaderStage::Mesh || + (m_pipelineState->getPrevShaderStage(m_shaderStage.value()) == ShaderStage::Mesh || m_pipelineState->isUnlinked()))) { // NOTE: For generic inputs of tessellation evaluation shader or fragment shader whose previous shader stage // is mesh shader or is in unlinked pipeline, they could be per-patch ones or per-primitive ones. @@ -664,7 +663,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { } } } else { - if (m_pipelineState->canPackInput(m_shaderStage)) { + if (m_pipelineState->canPackInput(m_shaderStage.value())) { // The inputLocInfoMap of {TCS, GS, FS} maps original InOutLocationInfo to tightly compact InOutLocationInfo const bool isTcs = m_shaderStage == ShaderStage::TessControl; (void)isTcs; @@ -700,7 +699,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { elemIdx = genericLocationOp.getElemIdx(); assert(isDontCareValue(elemIdx) == false); - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::TessControl: { auto &inputOp = cast(genericLocationOp); auto vertexIdx = inputOp.getArrayIndex(); @@ -833,7 +832,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { unsigned streamId = cast(callInst.getOperand(2))->getZExtValue(); // NOTE: Transform feedback output will be done in last vertex-processing shader stage. 
- switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: { // No TS/GS pipeline, VS is the last stage if (!m_hasGs && !m_hasTs) @@ -863,7 +862,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { } else if (isBuiltInOutputExport) { const unsigned builtInId = value; - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: { patchVsBuiltInOutputExport(output, builtInId, builder); break; @@ -881,7 +880,8 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { break; } case ShaderStage::Geometry: { - patchGsBuiltInOutputExport(output, builtInId, m_pipelineState->getRasterizerState().rasterStream, builder); + const unsigned streamId = cast(callInst.getOperand(1))->getZExtValue(); + patchGsBuiltInOutputExport(output, builtInId, streamId, builder); break; } case ShaderStage::Mesh: { @@ -967,7 +967,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { origLocInfo.setComponent(component); auto locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo); - if (m_pipelineState->canPackOutput(m_shaderStage)) { + if (m_pipelineState->canPackOutput(m_shaderStage.value())) { if (locInfoMapIt != resUsage->inOutUsage.outputLocInfoMap.end()) { loc = locInfoMapIt->second.getLocation(); elemIdx = locInfoMapIt->second.getComponent(); @@ -985,7 +985,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { // NOTE: Some outputs are not used by next shader stage. They must have been removed already. 
assert(loc != InvalidValue); - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: { assert(callInst.arg_size() == 3); if (elemIdx == InvalidValue) @@ -1085,8 +1085,23 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { if (m_pipelineState->getShaderModes()->getGeometryShaderMode().robustGsEmits) { auto totalEmitCounterPtr = m_pipelineSysValues.get(m_entryPoint)->getTotalEmitCounterPtr(); Value *totalEmitCounter = builder.CreateLoad(builder.getInt32Ty(), totalEmitCounterPtr); + + // totalEmitCounter++ totalEmitCounter = builder.CreateAdd(totalEmitCounter, builder.getInt32(1)); builder.CreateStore(totalEmitCounter, totalEmitCounterPtr); + + if (m_gfxIp.major < 11) { + // NOTE: For pre-GFX11, the counters of primitives written are driven by the message GS_EMIT/GS_CUT. + // Therefore, we must send such message conditionally by checking if the emit is within expected range. + + // validEmit = totalEmitCounter <= outputVertices + const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); + auto validEmit = builder.CreateICmpULE(totalEmitCounter, builder.getInt32(geometryMode.outputVertices)); + + // Send the GS_EMIT message conditionally + builder.CreateIf(validEmit, false); + callInst.moveBefore(&*builder.GetInsertPoint()); + } } } } @@ -1099,10 +1114,10 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { // @param retInst : "Ret" instruction void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { // We only handle the "ret" of shader entry point - if (m_shaderStage == ShaderStage::Invalid) + if (!m_shaderStage) return; - const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); + const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); // Whether this shader stage has to use "exp" instructions to export outputs const bool useExpInst = ((m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || @@ 
-1189,7 +1204,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { unsigned clipDistanceCount = 0; unsigned cullDistanceCount = 0; - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; if (m_shaderStage == ShaderStage::Vertex) { auto &builtInUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Vertex)->builtInUsage.vs; @@ -1529,7 +1544,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { // If we are building unlinked relocatable shaders, it is possible there are // generic outputs that are not written to. We need to count them in // the export count. - auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); for (const auto &locInfoPair : resUsage->inOutUsage.outputLocInfoMap) { const unsigned newLoc = locInfoPair.second.getLocation(); if (m_expLocs.count(newLoc) != 0) @@ -3322,6 +3337,9 @@ void PatchInOutImportExport::patchTesBuiltInOutputExport(Value *output, unsigned // @param builder : the builder to use void PatchInOutImportExport::patchGsBuiltInOutputExport(Value *output, unsigned builtInId, unsigned streamId, BuilderBase &builder) { + if (streamId != m_pipelineState->getRasterizerState().rasterStream) + return; // Skip built-in export if this stream is not the rasterization stream. 
+ const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry); const auto &builtInUsage = resUsage->builtInUsage.gs; const auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; @@ -3781,7 +3799,7 @@ void PatchInOutImportExport::storeValueToStreamOutBuffer(Value *storeValue, unsi Value *writeIndex = nullptr; Value *streamOffset = nullptr; - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; if (m_shaderStage == ShaderStage::Vertex) { streamInfo = getFunctionArgument(m_entryPoint, entryArgIdxs.vs.streamOutData.streamInfo); writeIndex = getFunctionArgument(m_entryPoint, entryArgIdxs.vs.streamOutData.writeIndex); @@ -3818,7 +3836,7 @@ void PatchInOutImportExport::storeValueToStreamOutBuffer(Value *storeValue, unsi streamOffset = builder.CreateShl(streamOffset, 2); // GPU will drop stream-out buffer store when the thread ID is invalid (OOB_select is set to SQ_OOB_INDEX_ONLY). - const unsigned outOfRangeWriteIndex = InvalidValue - (m_pipelineState->getShaderWaveSize(m_shaderStage) - 1); + const unsigned outOfRangeWriteIndex = InvalidValue - (m_pipelineState->getShaderWaveSize(m_shaderStage.value()) - 1); // validStreamOutVertex = threadId < streamOutVertexCount auto validStreamOutVertex = builder.CreateICmpULT(m_threadId, streamOutVertexCount); // writeIndex = validStreamOutVertex ? 
writeIndex : outOfRangeWriteIndex @@ -3893,7 +3911,7 @@ void PatchInOutImportExport::storeValueToEsGsRing(Value *storeValue, unsigned lo } // Call buffer store intrinsic or LDS store - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; Value *esGsOffset = nullptr; if (m_shaderStage == ShaderStage::Vertex) esGsOffset = getFunctionArgument(m_entryPoint, entryArgIdxs.vs.esGsOffset); @@ -4022,7 +4040,7 @@ void PatchInOutImportExport::storeValueToGsVsRing(Value *storeValue, unsigned lo storeValue = builder.CreateBitCast(storeValue, builder.getInt32Ty()); } - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; Value *gsVsOffset = getFunctionArgument(m_entryPoint, entryArgIdxs.gs.gsVsOffset); auto emitCounterPair = m_pipelineSysValues.get(m_entryPoint)->getEmitCounterPtr(); @@ -4224,8 +4242,8 @@ Value *PatchInOutImportExport::readValueFromLds(bool offChip, Type *readTy, Valu // Read from off-chip LDS buffer const auto &offChipLdsBaseArgIdx = m_shaderStage == ShaderStage::TessEval - ? m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs.tes.offChipLdsBase - : m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs.tcs.offChipLdsBase; + ? 
m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tes.offChipLdsBase + : m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tcs.offChipLdsBase; auto offChipLdsDesc = m_pipelineSysValues.get(m_entryPoint)->getOffChipLdsDesc(); @@ -4322,7 +4340,7 @@ void PatchInOutImportExport::writeValueToLds(bool offChip, Value *writeValue, Va if (offChip) { // Write to off-chip LDS buffer - auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs.tcs; + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tcs; auto offChipLdsBase = getFunctionArgument(m_entryPoint, entryArgIdxs.offChipLdsBase); // Convert dword off-chip LDS offset to byte offset @@ -4528,7 +4546,7 @@ Value *PatchInOutImportExport::calcLdsOffsetForTesInput(Type *inputTy, unsigned auto outPatchStart = calcFactor.offChip.outPatchStart; auto patchConstStart = calcFactor.offChip.patchConstStart; - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs.tes; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tes; auto relPatchId = getFunctionArgument(m_entryPoint, entryArgIdxs.relPatchId); @@ -4680,7 +4698,7 @@ unsigned PatchInOutImportExport::calcPatchCountPerThreadGroup(unsigned inVertexC void PatchInOutImportExport::addExportInstForGenericOutput(Value *output, unsigned location, unsigned compIdx, BuilderBase &builder) { // Check if the shader stage is valid to use "exp" instruction to export output - const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); + const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); const bool useExpInst = ((m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::CopyShader) && (!nextStage || nextStage == ShaderStage::Fragment)); @@ -4874,7 +4892,7 @@ Value 
*PatchInOutImportExport::getSubgroupLocalInvocationId(BuilderBase &builder Value *subgroupLocalInvocationId = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {builder.getInt32(-1), builder.getInt32(0)}); - unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); if (waveSize == 64) { subgroupLocalInvocationId = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {builder.getInt32(-1), subgroupLocalInvocationId}); @@ -5401,7 +5419,7 @@ void PatchInOutImportExport::recordVertexAttribExport(unsigned location, ArrayRe m_attribExports[location][i] = attribValues[i]; // Update values that are valid } - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; inOutUsage.expCount = std::max(inOutUsage.expCount, location + 1); // Update export count } @@ -5413,7 +5431,7 @@ void PatchInOutImportExport::exportVertexAttribs(BuilderBase &builder) { assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::CopyShader); // Valid shader stages if (m_attribExports.empty()) { - assert(m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage.expCount == 0); + assert(m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage.expCount == 0); return; } diff --git a/lgc/patch/PatchInvariantLoads.cpp b/lgc/patch/LowerInvariantLoads.cpp similarity index 97% rename from lgc/patch/PatchInvariantLoads.cpp rename to lgc/patch/LowerInvariantLoads.cpp index 651d6a5ec7..770c717218 100644 --- a/lgc/patch/PatchInvariantLoads.cpp +++ b/lgc/patch/LowerInvariantLoads.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** 
*********************************************************************************************************************** - * @file PatchInvariantLoads.cpp - * @brief LLPC source file: contains implementation of class lgc::PatchInvariantLoads. + * @file LowerInvariantLoads.cpp + * @brief LLPC source file: contains implementation of class lgc::LowerInvariantLoads. *********************************************************************************************************************** */ -#include "lgc/patch/PatchInvariantLoads.h" +#include "lgc/patch/LowerInvariantLoads.h" #include "lgc/patch/Patch.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" @@ -37,7 +37,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "lgc-patch-invariant-loads" +#define DEBUG_TYPE "lgc-lower-invariant-loads" using namespace llvm; using namespace lgc; @@ -83,12 +83,12 @@ static unsigned findAddressSpaceAccess(const Instruction *inst) { // @param [in/out] function : Function that we will patch. 
// @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchInvariantLoads::run(Function &function, FunctionAnalysisManager &analysisManager) { +PreservedAnalyses LowerInvariantLoads::run(Function &function, FunctionAnalysisManager &analysisManager) { const auto &moduleAnalysisManager = analysisManager.getResult(function); PipelineState *pipelineState = moduleAnalysisManager.getCachedResult(*function.getParent())->getPipelineState(); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Invariant-Loads\n"); + LLVM_DEBUG(dbgs() << "Run the pass Lower-Invariant-Loads\n"); auto shaderStage = lgc::getShaderStage(&function); if (!shaderStage) diff --git a/lgc/patch/PatchMulDx9Zero.cpp b/lgc/patch/LowerMulDx9Zero.cpp similarity index 99% rename from lgc/patch/PatchMulDx9Zero.cpp rename to lgc/patch/LowerMulDx9Zero.cpp index b9cdb6f537..fa9495121d 100644 --- a/lgc/patch/PatchMulDx9Zero.cpp +++ b/lgc/patch/LowerMulDx9Zero.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchMulDx9Zero.cpp + * @file LowerMulDx9Zero.cpp * @brief LLPC source file: contains implementation of class lgc::PatchMulDx9Zero. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchMulDx9Zero.h" +#include "lgc/patch/LowerMulDx9Zero.h" #include "lgc/state/PipelineShaders.h" #include "lgc/state/PipelineState.h" #include "llvm/IR/Constants.h" diff --git a/lgc/patch/MeshTaskShader.cpp b/lgc/patch/MeshTaskShader.cpp index 0fcd139544..c289b82595 100644 --- a/lgc/patch/MeshTaskShader.cpp +++ b/lgc/patch/MeshTaskShader.cpp @@ -58,12 +58,6 @@ MeshTaskShader::MeshTaskShader(PipelineState *pipelineState, m_pipelineSysValues.initialize(m_pipelineState); } -// ===================================================================================================================== -// Destructor -MeshTaskShader::~MeshTaskShader() { - m_pipelineSysValues.clear(); -} - // ===================================================================================================================== // Layout mesh shader LDS if 'ldsLayout' is specified and calculate the required total LDS size (in dwords). 
// diff --git a/lgc/patch/MeshTaskShader.h b/lgc/patch/MeshTaskShader.h index 84ed5b1b12..35075fdf07 100644 --- a/lgc/patch/MeshTaskShader.h +++ b/lgc/patch/MeshTaskShader.h @@ -86,7 +86,6 @@ struct MeshOutputsLayout { class MeshTaskShader { public: MeshTaskShader(PipelineState *pipelineState, PatchPreparePipelineAbi::FunctionAnalysisHandlers *analysisHandlers); - ~MeshTaskShader(); static unsigned layoutMeshShaderLds(PipelineState *pipelineState, llvm::Function *entryPoint, MeshLdsLayout *ldsLayout = nullptr, MeshOutputsLayout *outputsLayout = nullptr); diff --git a/lgc/patch/PatchEntryPointMutate.cpp b/lgc/patch/MutateEntryPoint.cpp similarity index 94% rename from lgc/patch/PatchEntryPointMutate.cpp rename to lgc/patch/MutateEntryPoint.cpp index 2e0808f118..f69f32b56a 100644 --- a/lgc/patch/PatchEntryPointMutate.cpp +++ b/lgc/patch/MutateEntryPoint.cpp @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchEntryPointMutate.cpp - * @brief The lgc::PatchEntryPointMutate pass determines the final user data layout of shaders. + * @file MutateEntryPoint.cpp + * @brief The lgc::MutateEntryPoint pass determines the final user data layout of shaders. 
* * This consists of * - removing unused user data @@ -53,7 +53,7 @@ *********************************************************************************************************************** */ -#include "lgc/patch/PatchEntryPointMutate.h" +#include "lgc/patch/MutateEntryPoint.h" #include "ShaderMerger.h" #include "compilerutils/CompilerUtils.h" #include "lgc/LgcContext.h" @@ -80,21 +80,21 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include -#define DEBUG_TYPE "lgc-patch-entry-point-mutate" +#define DEBUG_TYPE "lgc-mutate-entry-point" using namespace llvm; using namespace lgc; using namespace cps; // ===================================================================================================================== -PatchEntryPointMutate::PatchEntryPointMutate() +MutateEntryPoint::MutateEntryPoint() : m_hasTs(false), m_hasGs(false), m_setInactiveChainArgId(Function::lookupIntrinsicID("llvm.amdgcn.set.inactive.chain.arg")) { } // ===================================================================================================================== -PatchEntryPointMutate::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::Twine &name, unsigned userDataValue, - unsigned *argIndex) +MutateEntryPoint::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::Twine &name, unsigned userDataValue, + unsigned *argIndex) : argTy(argTy), name(name.str()), userDataValue(userDataValue), argIndex(argIndex) { if (llvm::isa(argTy)) argDwordSize = argTy->getPointerAddressSpace() == ADDR_SPACE_CONST_32BIT ? 
1 : 2; @@ -103,8 +103,8 @@ PatchEntryPointMutate::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::T } // ===================================================================================================================== -PatchEntryPointMutate::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::Twine &name, - UserDataMapping userDataValue, unsigned *argIndex) +MutateEntryPoint::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::Twine &name, UserDataMapping userDataValue, + unsigned *argIndex) : UserDataArg(argTy, name, static_cast(userDataValue), argIndex) { } @@ -114,11 +114,11 @@ PatchEntryPointMutate::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::T // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchEntryPointMutate::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses MutateEntryPoint::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); PipelineShadersResult &pipelineShaders = analysisManager.getResult(module); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Entry-Point-Mutate\n"); + LLVM_DEBUG(dbgs() << "Run the pass Mutate-Entry-Point\n"); Patch::init(&module); @@ -140,13 +140,13 @@ PreservedAnalyses PatchEntryPointMutate::run(Module &module, ModuleAnalysisManag if (m_pipelineState->isGraphics()) { // Process each shader in turn, but not the copy shader. - for (unsigned shaderStage = 0; shaderStage < ShaderStage::NativeStageCount; ++shaderStage) { - m_entryPoint = pipelineShaders.getEntryPoint(static_cast(shaderStage)); + for (auto stage : ShaderStagesNative) { + m_entryPoint = pipelineShaders.getEntryPoint(stage); if (m_entryPoint) { // ToDo: This should always be skipped since we don't implement CPS metadata yet. 
assert(!lgc::cps::isCpsFunction(*m_entryPoint) && "CPS support not implemented yet"); - m_shaderStage = static_cast(shaderStage); + m_shaderStage = stage; processShader(&shaderInputs); } } @@ -260,12 +260,12 @@ static Value *mergeDwordsIntoVector(IRBuilder<> &builder, ArrayRef inpu // Process LoadDriverTableEntryOp. // // @param module : LLVM module -void PatchEntryPointMutate::processDriverTableLoad(Module &module) { +void MutateEntryPoint::processDriverTableLoad(Module &module) { SmallVector callsToRemove; struct Payload { SmallVectorImpl &callsToRemove; - PatchEntryPointMutate *self; + MutateEntryPoint *self; }; Payload payload = {callsToRemove, this}; @@ -287,7 +287,7 @@ void PatchEntryPointMutate::processDriverTableLoad(Module &module) { // Lower LoadDriverTableEntryOp. // // @param loadDriverTablePtrOp : Call instruction to load driver table pointer -void PatchEntryPointMutate::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDriverTablePtrOp) { +void MutateEntryPoint::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDriverTablePtrOp) { BuilderBase builder(&loadDriverTablePtrOp); Function *entryPoint = loadDriverTablePtrOp.getFunction(); builder.SetInsertPoint(&loadDriverTablePtrOp); @@ -304,12 +304,12 @@ void PatchEntryPointMutate::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDri // Process GroupMemcpyOp. // // @param module : LLVM module -void PatchEntryPointMutate::processGroupMemcpy(Module &module) { +void MutateEntryPoint::processGroupMemcpy(Module &module) { SmallVector callsToRemove; struct Payload { SmallVectorImpl &callsToRemove; - PatchEntryPointMutate *self; + MutateEntryPoint *self; }; Payload payload = {callsToRemove, this}; @@ -331,7 +331,7 @@ void PatchEntryPointMutate::processGroupMemcpy(Module &module) { // Lower GroupMemcpyOp - Copy memory using threads in a workgroup (scope=2) or subgroup (scope=3). 
// // @param groupMemcpyOp : Call instruction to do group memory copy -void PatchEntryPointMutate::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) { +void MutateEntryPoint::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) { BuilderImpl builder(m_pipelineState); Function *entryPoint = groupMemcpyOp.getFunction(); auto stage = getShaderStage(entryPoint); @@ -512,7 +512,7 @@ void PatchEntryPointMutate::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) { // Lower as.continuation.reference call. // // @param asCpsReferenceOp: the instruction -void PatchEntryPointMutate::lowerAsCpsReference(cps::AsContinuationReferenceOp &asCpsReferenceOp) { +void MutateEntryPoint::lowerAsCpsReference(cps::AsContinuationReferenceOp &asCpsReferenceOp) { BuilderBase builder(&asCpsReferenceOp); Value *reloc = nullptr; @@ -534,14 +534,14 @@ void PatchEntryPointMutate::lowerAsCpsReference(cps::AsContinuationReferenceOp & // @param shaderInputs: the ShaderInputs information for the parent function. This is only used for continufy based // continuation transform, under which we still need to pass ShaderInput arguments(WorkgroupId/LocalInvocationId) during // cps chain call. -bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInputs) { +bool MutateEntryPoint::lowerCpsOps(Function *func, ShaderInputs *shaderInputs) { SmallVector cpsJumps; SmallVector tobeErased; struct Payload { SmallVectorImpl &jumps; SmallVectorImpl &tobeErased; - PatchEntryPointMutate *self; + MutateEntryPoint *self; }; Payload payload = {cpsJumps, tobeErased, this}; @@ -657,7 +657,7 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInpu // Jump to next cps function. // ret: // ret void - unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); Type *waveMaskTy = builder.getIntNTy(waveSize); // For continufy based continuation, the vgpr list: LocalInvocationId(optional), vcr, vsp, ... 
unsigned vcrIndexInVgpr = haveLocalInvocationId ? 1 : 0; @@ -791,8 +791,8 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInpu // @param func : the cps function to be mutated // @param fixedShaderArgTys : the types of the fixed shader arguments(userdata + possibly shader inputs) // @param argNames : the name string of the fixed shader arguments -Function *PatchEntryPointMutate::lowerCpsFunction(Function *func, ArrayRef fixedShaderArgTys, - ArrayRef argNames) { +Function *MutateEntryPoint::lowerCpsFunction(Function *func, ArrayRef fixedShaderArgTys, + ArrayRef argNames) { Value *state = func->getArg(0); const DataLayout &layout = func->getParent()->getDataLayout(); IRBuilder<> builder(func->getContext()); @@ -889,8 +889,7 @@ Function *PatchEntryPointMutate::lowerCpsFunction(Function *func, ArrayRef &builder, Type *waveMaskTy, - ArrayRef priorties) { +Value *MutateEntryPoint::takeLevel(Value *level, IRBuilder<> &builder, Type *waveMaskTy, ArrayRef priorties) { auto levelMask = builder.CreateICmpNE(level, builder.getInt32(0)); Value *levelBallot = builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, waveMaskTy, levelMask); Value *cond = nullptr; @@ -911,8 +910,8 @@ Value *PatchEntryPointMutate::takeLevel(Value *level, IRBuilder<> &builder, Type // @param parent : the parent function of the cps.jump operation // @param jumpOp : the call instruction of cps.jump // @param [in/out] exitInfos : the vector of cps exit information to be filled -unsigned PatchEntryPointMutate::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, BasicBlock *tailBlock, - SmallVectorImpl &exitInfos) { +unsigned MutateEntryPoint::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, BasicBlock *tailBlock, + SmallVectorImpl &exitInfos) { IRBuilder<> builder(parent->getContext()); const DataLayout &layout = parent->getParent()->getDataLayout(); // Translate @lgc.cps.jump(CR %target, i32 %levels, T %state, ...) 
into: @@ -965,7 +964,7 @@ unsigned PatchEntryPointMutate::lowerCpsJump(Function *parent, cps::JumpOp *jump // are potentially used in other functions. It also modifies each call to pass the shader inputs between functions. // // @param module : IR module -void PatchEntryPointMutate::setupComputeWithCalls(Module *module) { +void MutateEntryPoint::setupComputeWithCalls(Module *module) { m_computeWithCalls = false; if (m_pipelineState->isComputeLibrary()) { @@ -1000,11 +999,11 @@ void PatchEntryPointMutate::setupComputeWithCalls(Module *module) { // Gather user data usage in all shaders // // @param module : IR module -void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { +void MutateEntryPoint::gatherUserDataUsage(Module *module) { // Gather special ops requiring user data. static const auto visitor = - llvm_dialects::VisitorBuilder() - .add([](PatchEntryPointMutate &self, UserDataOp &op) { + llvm_dialects::VisitorBuilder() + .add([](MutateEntryPoint &self, UserDataOp &op) { auto stage = getShaderStage(op.getFunction()); assert(stage != ShaderStage::CopyShader); auto userDataUsage = self.getUserDataUsage(stage.value()); @@ -1064,7 +1063,7 @@ void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { self.m_pipelineState->getPalMetadata()->setUserDataSpillUsage(op.getOffset() / 4, stage); } }) - .add([](PatchEntryPointMutate &self, LoadUserDataOp &op) { + .add([](MutateEntryPoint &self, LoadUserDataOp &op) { auto stage = getShaderStage(op.getFunction()); assert(stage != ShaderStage::CopyShader); auto *userDataUsage = self.getUserDataUsage(stage.value()); @@ -1111,8 +1110,8 @@ void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { // ===================================================================================================================== // Load a value of a simple type from user data at the given dwordOffset. 
-Value *PatchEntryPointMutate::loadUserData(const UserDataUsage &userDataUsage, Value *spillTable, Type *type, - unsigned dwordOffset, BuilderBase &builder) { +Value *MutateEntryPoint::loadUserData(const UserDataUsage &userDataUsage, Value *spillTable, Type *type, + unsigned dwordOffset, BuilderBase &builder) { Function *func = builder.GetInsertBlock()->getParent(); unsigned dwordSize = m_module->getDataLayout().getTypeStoreSize(type) / 4; if (dwordOffset + dwordSize <= userDataUsage.entryArgIdxs.size()) { @@ -1163,7 +1162,7 @@ Value *PatchEntryPointMutate::loadUserData(const UserDataUsage &userDataUsage, V // spilled. // // @param module : IR module -void PatchEntryPointMutate::fixupUserDataUses(Module &module) { +void MutateEntryPoint::fixupUserDataUses(Module &module) { BuilderBase builder(module.getContext()); // For each function definition... @@ -1252,7 +1251,7 @@ void PatchEntryPointMutate::fixupUserDataUses(Module &module) { // Process a single shader // // @param shaderInputs : ShaderInputs object representing hardware-provided shader inputs -void PatchEntryPointMutate::processShader(ShaderInputs *shaderInputs) { +void MutateEntryPoint::processShader(ShaderInputs *shaderInputs) { // Create new entry-point from the original one SmallVector argTys; SmallVector argNames; @@ -1265,7 +1264,7 @@ void PatchEntryPointMutate::processShader(ShaderInputs *shaderInputs) { addFunctionArgs(origEntryPoint, origEntryPoint->getFunctionType()->getReturnType(), argTys, argNames, inRegMask); // We always deal with pre-merge functions here, so set the fitting pre-merge calling conventions. 
- switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Task: entryPoint->setCallingConv(CallingConv::AMDGPU_CS); break; @@ -1311,7 +1310,7 @@ void PatchEntryPointMutate::processShader(ShaderInputs *shaderInputs) { // // @param shaderInputs : ShaderInputs object representing hardware-provided shader inputs // @param [in/out] module : Module -void PatchEntryPointMutate::processComputeFuncs(ShaderInputs *shaderInputs, Module &module) { +void MutateEntryPoint::processComputeFuncs(ShaderInputs *shaderInputs, Module &module) { m_shaderStage = ShaderStage::Compute; // We no longer support compute shader fixed layout required before PAL interface version 624. @@ -1398,9 +1397,8 @@ void PatchEntryPointMutate::processComputeFuncs(ShaderInputs *shaderInputs, Modu // Process all real function calls and passes arguments to them. // // @param [in/out] module : Module -void PatchEntryPointMutate::processCalls(Function &func, ArrayRef shaderInputTys, - ArrayRef shaderInputNames, uint64_t inRegMask, - unsigned argOffset) { +void MutateEntryPoint::processCalls(Function &func, ArrayRef shaderInputTys, + ArrayRef shaderInputNames, uint64_t inRegMask, unsigned argOffset) { // This is one of: // - a compute pipeline with non-inlined functions; // - a compute pipeline with calls to library functions; @@ -1457,7 +1455,7 @@ void PatchEntryPointMutate::processCalls(Function &func, ArrayRef shader // ===================================================================================================================== // Set Attributes on new function -void PatchEntryPointMutate::setFuncAttrs(Function *entryPoint) { +void MutateEntryPoint::setFuncAttrs(Function *entryPoint) { AttrBuilder builder(entryPoint->getContext()); if (m_shaderStage == ShaderStage::Fragment) { auto &builtInUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment)->builtInUsage.fs; @@ -1520,8 +1518,8 @@ void PatchEntryPointMutate::setFuncAttrs(Function *entryPoint) { } // Set VGPR, 
SGPR, and wave limits - auto shaderOptions = &m_pipelineState->getShaderOptions(m_shaderStage); - auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + auto shaderOptions = &m_pipelineState->getShaderOptions(m_shaderStage.value()); + auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); unsigned vgprLimit = shaderOptions->vgprLimit; unsigned sgprLimit = shaderOptions->sgprLimit; @@ -1552,7 +1550,7 @@ void PatchEntryPointMutate::setFuncAttrs(Function *entryPoint) { // Graphics shader stages don't have thread groups at an API level tgSize = 1; } - unsigned numWavesPerTg = divideCeil(tgSize, m_pipelineState->getShaderWaveSize(m_shaderStage)); + unsigned numWavesPerTg = divideCeil(tgSize, m_pipelineState->getShaderWaveSize(m_shaderStage.value())); unsigned maxWavesPerCu = numWavesPerTg * shaderOptions->maxThreadGroupsPerComputeUnit; unsigned maxWavesPerSimd = divideCeil(maxWavesPerCu, 2); std::string wavesPerEu = std::string("1,") + std::to_string(maxWavesPerSimd); @@ -1627,14 +1625,14 @@ void PatchEntryPointMutate::setFuncAttrs(Function *entryPoint) { // @returns inRegMask : "Inreg" bit mask for the arguments, with a bit set to indicate that the corresponding // arg needs to have an "inreg" attribute to put the arg into SGPRs rather than VGPRs // -uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInputs, Function *origFunc, - SmallVectorImpl &argTys, - SmallVectorImpl &argNames, unsigned argOffset, - bool updateUserDataMap) { +uint64_t MutateEntryPoint::generateEntryPointArgTys(ShaderInputs *shaderInputs, Function *origFunc, + SmallVectorImpl &argTys, + SmallVectorImpl &argNames, unsigned argOffset, + bool updateUserDataMap) { uint64_t inRegMask = 0; IRBuilder<> builder(*m_context); - auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage); + auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage.value()); auto &entryArgIdxs = intfData->entryArgIdxs; 
entryArgIdxs.initialized = true; @@ -1677,7 +1675,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp "Expecting descriptor set values to be one dword. The linker cannot handle anything else."); if (isSystemUserData) { unsigned index = userDataArg.userDataValue - static_cast(UserDataMapping::GlobalTable); - auto &specialUserData = getUserDataUsage(m_shaderStage)->specialUserData; + auto &specialUserData = getUserDataUsage(m_shaderStage.value())->specialUserData; if (index < specialUserData.size()) specialUserData[index].entryArgIdx = argTys.size() + argOffset; } @@ -1693,7 +1691,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp // Only applies to wave32 // TODO: Can we further exclude PS if LDS_GROUP_SIZE == 0 - if (m_pipelineState->getShaderWaveSize(m_shaderStage) == 32 && + if (m_pipelineState->getShaderWaveSize(m_shaderStage.value()) == 32 && (m_shaderStage == ShaderStage::Compute || m_shaderStage == ShaderStage::Fragment || m_shaderStage == ShaderStage::Mesh)) { unsigned userDataLimit = m_shaderStage == ShaderStage::Mesh ? 8 : 16; @@ -1711,8 +1709,8 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp // Push the fixed system (not user data) register args. 
if (shaderInputs) - inRegMask |= shaderInputs->getShaderArgTys(m_pipelineState, m_shaderStage, origFunc, m_computeWithCalls, argTys, - argNames, argOffset); + inRegMask |= shaderInputs->getShaderArgTys(m_pipelineState, m_shaderStage.value(), origFunc, m_computeWithCalls, + argTys, argNames, argOffset); if (updateUserDataMap) { constexpr unsigned NumUserSgprs = 32; @@ -1732,7 +1730,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp } userDataIdx += dwordSize; } - m_pipelineState->setUserDataMap(m_shaderStage, userDataMap); + m_pipelineState->setUserDataMap(m_shaderStage.value(), userDataMap); } return inRegMask; @@ -1741,7 +1739,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp // ===================================================================================================================== // @param userDataValue : The value to be written into a user data entry. // @returns : True if the user data value corresponds to a special system user data value. -bool PatchEntryPointMutate::isSystemUserDataValue(unsigned userDataValue) const { +bool MutateEntryPoint::isSystemUserDataValue(unsigned userDataValue) const { if (userDataValue < static_cast(UserDataMapping::GlobalTable)) { return false; } @@ -1751,7 +1749,7 @@ bool PatchEntryPointMutate::isSystemUserDataValue(unsigned userDataValue) const // ===================================================================================================================== // @param userDataValue : The value to be written into a user data entry. // @returns : True if the user data value corresponds to an unlinked descriptor set. 
-bool PatchEntryPointMutate::isUnlinkedDescriptorSetValue(unsigned userDataValue) const { +bool MutateEntryPoint::isUnlinkedDescriptorSetValue(unsigned userDataValue) const { if (userDataValue < static_cast(UserDataMapping::DescriptorSet0)) { return false; } @@ -1761,19 +1759,18 @@ bool PatchEntryPointMutate::isUnlinkedDescriptorSetValue(unsigned userDataValue) // ===================================================================================================================== // Add a UserDataArg to the appropriate vector for each special argument (e.g. ViewId) needed in user data SGPRs. // In here, we need to check whether an argument is needed in two ways: -// 1. Whether a flag is set saying it will be needed after PatchEntryPointMutate +// 1. Whether a flag is set saying it will be needed after MutateEntryPoint // 2. Whether there is an actual use of the special user data value (lgc.special.user.data call) generated -// before PatchEntryPointMutate, which we check with userDataUsage->isSpecialUserDataUsed(). +// before MutateEntryPoint, which we check with userDataUsage->isSpecialUserDataUsed(). 
// // @param userDataArgs : Vector to add args to when they need to go before user data nodes (just streamout) // @param specialUserDataArgs : Vector to add args to when they need to go after user data nodes (all the rest) // @param builder : IRBuilder to get types from -void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl &userDataArgs, - SmallVectorImpl &specialUserDataArgs, - IRBuilder<> &builder) { +void MutateEntryPoint::addSpecialUserDataArgs(SmallVectorImpl &userDataArgs, + SmallVectorImpl &specialUserDataArgs, IRBuilder<> &builder) { - auto userDataUsage = getUserDataUsage(m_shaderStage); - auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage); + auto userDataUsage = getUserDataUsage(m_shaderStage.value()); + auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage.value()); auto &entryArgIdxs = intfData->entryArgIdxs; bool enableNgg = m_pipelineState->isGraphics() ? m_pipelineState->getNggControl()->enableNgg : false; @@ -1788,7 +1785,7 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl if (m_pipelineState->getInputAssemblyState().multiView != MultiViewMode::Disable) { unsigned *argIdx = nullptr; auto userDataValue = UserDataMapping::ViewId; - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: argIdx = &entryArgIdxs.vs.viewId; break; @@ -1807,7 +1804,7 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "viewId", userDataValue, argIdx)); } - if (getMergedShaderStage(m_shaderStage) == getMergedShaderStage(ShaderStage::Vertex)) { + if (getMergedShaderStage(m_shaderStage.value()) == getMergedShaderStage(ShaderStage::Vertex)) { // This is the VS, or the shader that VS is merged into on GFX9+. 
auto vsIntfData = m_pipelineState->getShaderInterfaceData(ShaderStage::Vertex); auto vsResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Vertex); @@ -1917,7 +1914,7 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl // If no NGG, stream out table will be set to copy shader's user data entry, we should not set it duplicately. unsigned *tablePtr = nullptr; - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: tablePtr = &intfData->entryArgIdxs.vs.streamOutData.tablePtr; break; @@ -1951,7 +1948,7 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl if (m_pipelineState->enableSwXfb()) { unsigned *controlBufPtr = nullptr; - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: controlBufPtr = &intfData->entryArgIdxs.vs.streamOutData.controlBufPtr; break; @@ -1980,9 +1977,9 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl // of user data arguments // @param specialUserDataArgs : list of suffix "system value" user data arguments // @param builder : IRBuilder to get types from -void PatchEntryPointMutate::finalizeUserDataArgs(SmallVectorImpl &userDataArgs, - ArrayRef specialUserDataArgs, IRBuilder<> &builder) { - auto userDataUsage = getUserDataUsage(m_shaderStage); +void MutateEntryPoint::finalizeUserDataArgs(SmallVectorImpl &userDataArgs, + ArrayRef specialUserDataArgs, IRBuilder<> &builder) { + auto userDataUsage = getUserDataUsage(m_shaderStage.value()); // In compute-with-calls, we need to ensure that the compute shader and library code agree that s15 is the spill // table pointer, even if it is not needed, because library code does not know whether a spill table pointer is @@ -2120,9 +2117,8 @@ void PatchEntryPointMutate::finalizeUserDataArgs(SmallVectorImpl &u // Get UserDataUsage struct for the merged shader stage that contains the given shader stage // // @param stage : Shader stage -PatchEntryPointMutate::UserDataUsage 
*PatchEntryPointMutate::getUserDataUsage(ShaderStageEnum stage) { +MutateEntryPoint::UserDataUsage *MutateEntryPoint::getUserDataUsage(ShaderStageEnum stage) { stage = getMergedShaderStage(stage); - m_userDataUsage.resize(std::max(m_userDataUsage.size(), static_cast(stage) + 1)); if (!m_userDataUsage[stage]) m_userDataUsage[stage] = std::make_unique(); return &*m_userDataUsage[stage]; @@ -2136,7 +2132,7 @@ PatchEntryPointMutate::UserDataUsage *PatchEntryPointMutate::getUserDataUsage(Sh // TES -> GS (if it exists) // // @param stage : Shader stage -ShaderStageEnum PatchEntryPointMutate::getMergedShaderStage(ShaderStageEnum stage) const { +ShaderStageEnum MutateEntryPoint::getMergedShaderStage(ShaderStageEnum stage) const { switch (stage) { case ShaderStage::Vertex: if (m_pipelineState->hasShaderStage(ShaderStage::TessControl)) @@ -2153,18 +2149,18 @@ ShaderStageEnum PatchEntryPointMutate::getMergedShaderStage(ShaderStageEnum stag } // ===================================================================================================================== -bool PatchEntryPointMutate::isComputeWithCalls() const { +bool MutateEntryPoint::isComputeWithCalls() const { return m_computeWithCalls; } // ===================================================================================================================== -bool PatchEntryPointMutate::UserDataUsage::isSpecialUserDataUsed(UserDataMapping kind) { +bool MutateEntryPoint::UserDataUsage::isSpecialUserDataUsed(UserDataMapping kind) { unsigned index = static_cast(kind) - static_cast(UserDataMapping::GlobalTable); return specialUserData.size() > index && !specialUserData[index].users.empty(); } // ===================================================================================================================== -void PatchEntryPointMutate::UserDataUsage::addLoad(unsigned dwordOffset, unsigned dwordSize) { +void MutateEntryPoint::UserDataUsage::addLoad(unsigned dwordOffset, unsigned dwordSize) { assert(dwordOffset + 
dwordSize <= 256 && "shader uses a user data region that is too large"); if (dwordOffset + dwordSize > loadSizes.size()) diff --git a/lgc/patch/NggPrimShader.cpp b/lgc/patch/NggPrimShader.cpp index a1e25e904d..73207cd6b0 100644 --- a/lgc/patch/NggPrimShader.cpp +++ b/lgc/patch/NggPrimShader.cpp @@ -246,9 +246,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // ES-GS ring if (ldsLayout) { - // NOTE: We round ES-GS LDS size to 4-dword alignment. This is for later LDS read/write operations of mutilple - // dwords (such as DS128). - ldsRegionSize = alignTo(calcFactor.esGsLdsSize, 4U); + ldsRegionSize = calcFactor.esGsLdsSize; printLdsRegionInfo("ES-GS Ring", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::EsGsRing] = std::make_pair(ldsOffset, ldsRegionSize); @@ -7049,7 +7047,6 @@ void NggPrimShader::prepareSwXfb(ArrayRef primCountInSubgroup) { } Value *dwordsWritten[MaxTransformFeedbackBuffers] = {}; - Value *dwordsPerPrim[MaxTransformFeedbackBuffers] = {}; // Calculate numPrimsToWrite for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { @@ -7090,20 +7087,14 @@ void NggPrimShader::prepareSwXfb(ArrayRef primCountInSubgroup) { dwordsRemaining = m_builder.CreateIntrinsic(Intrinsic::smax, dwordsRemaining->getType(), {dwordsRemaining, m_builder.getInt32(0)}); // numPrimsToWrite = min(dwordsRemaining / dwordsPerPrim, numPrimsToWrite) - dwordsPerPrim[i] = + Value *dwordsPerPrim = m_builder.CreateMul(m_verticesPerPrimitive, m_builder.getInt32(xfbStrides[i] / sizeof(unsigned))); - Value *primsCanWrite = m_builder.CreateUDiv(dwordsRemaining, dwordsPerPrim[i]); + Value *primsCanWrite = m_builder.CreateUDiv(dwordsRemaining, dwordsPerPrim); numPrimsToWrite[xfbBufferToStream[i]] = m_builder.CreateIntrinsic(Intrinsic::umin, numPrimsToWrite[xfbBufferToStream[i]]->getType(), {numPrimsToWrite[xfbBufferToStream[i]], primsCanWrite}); - } - - // Increment dwordsWritten - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - 
if (!bufferActive[i]) - continue; - Value *dwordsToWrite = m_builder.CreateMul(numPrimsToWrite[xfbBufferToStream[i]], dwordsPerPrim[i]); + Value *dwordsToWrite = m_builder.CreateMul(numPrimsToWrite[xfbBufferToStream[i]], dwordsPerPrim); if (i == lastActiveBuffer) { // ds_ordered_count, wave done diff --git a/lgc/patch/PassRegistry.inc b/lgc/patch/PassRegistry.inc index c2018e9f34..322336a266 100644 --- a/lgc/patch/PassRegistry.inc +++ b/lgc/patch/PassRegistry.inc @@ -57,18 +57,18 @@ LLPC_MODULE_PASS("lgc-builder-replayer", BuilderReplayer) LLPC_MODULE_PASS("lgc-continufy", Continufy) LLPC_MODULE_PASS("lgc-patch-resource-collect", PatchResourceCollect) LLPC_MODULE_PASS("lgc-patch-initialize-workgroup-memory", PatchInitializeWorkgroupMemory) -LLPC_MODULE_PASS("lgc-patch-image-derivatives", PatchImageDerivatives) +LLPC_MODULE_PASS("lgc-lower-image-derivatives", LowerImageDerivatives) LLPC_MODULE_PASS("lgc-patch-in-out-import-export", PatchInOutImportExport) -LLPC_FUNCTION_PASS("lgc-patch-invariant-loads", PatchInvariantLoads) +LLPC_FUNCTION_PASS("lgc-lower-invariant-loads", LowerInvariantLoads) LLPC_MODULE_PASS("lgc-patch-setup-target-features", PatchSetupTargetFeatures) -LLPC_MODULE_PASS("lgc-patch-copy-shader", PatchCopyShader) +LLPC_MODULE_PASS("lgc-generate-copy-shader", GenerateCopyShader) LLPC_MODULE_PASS("lgc-patch-prepare-pipeline-abi", PatchPreparePipelineAbi) LLPC_FUNCTION_PASS("lgc-patch-read-first-lane", PatchReadFirstLane) LLPC_MODULE_PASS("lgc-patch-llvm-ir-inclusion", PatchLlvmIrInclusion) LLPC_FUNCTION_PASS("lgc-patch-peephole-opt", PatchPeepholeOpt) LLPC_MODULE_PASS("lgc-lower-subgroup-ops", LowerSubgroupOps) -LLPC_MODULE_PASS("lgc-patch-entry-point-mutate", PatchEntryPointMutate) -LLPC_MODULE_PASS("lgc-patch-check-shader-cache", PatchCheckShaderCache) +LLPC_MODULE_PASS("lgc-mutate-entry-point", MutateEntryPoint) +LLPC_MODULE_PASS("lgc-patch-check-shader-cache", CheckShaderCache) LLPC_LOOP_PASS("lgc-patch-loop-metadata", PatchLoopMetadata) 
LLPC_FUNCTION_PASS("lgc-patch-buffer-op", PatchBufferOp) LLPC_MODULE_PASS("lgc-patch-workarounds", PatchWorkarounds) diff --git a/lgc/patch/Patch.cpp b/lgc/patch/Patch.cpp index fbb01c5c57..bac8f27977 100644 --- a/lgc/patch/Patch.cpp +++ b/lgc/patch/Patch.cpp @@ -37,33 +37,34 @@ #include "lgc/PassManager.h" #include "lgc/Pipeline.h" #include "lgc/builder/BuilderReplayer.h" +#include "lgc/patch/AddLoopMetadata.h" +#include "lgc/patch/CheckShaderCache.h" +#include "lgc/patch/CollectImageOperations.h" #include "lgc/patch/Continufy.h" #include "lgc/patch/FragColorExport.h" +#include "lgc/patch/GenerateCopyShader.h" +#include "lgc/patch/IncludeLlvmIr.h" #include "lgc/patch/LowerDebugPrintf.h" #include "lgc/patch/LowerDesc.h" #include "lgc/patch/LowerGpuRt.h" +#include "lgc/patch/LowerImageDerivatives.h" +#include "lgc/patch/LowerInOut.h" +#include "lgc/patch/LowerInvariantLoads.h" +#include "lgc/patch/LowerMulDx9Zero.h" #include "lgc/patch/LowerSubgroupOps.h" +#include "lgc/patch/MutateEntryPoint.h" #include "lgc/patch/PatchBufferOp.h" -#include "lgc/patch/PatchCheckShaderCache.h" -#include "lgc/patch/PatchCopyShader.h" -#include "lgc/patch/PatchEntryPointMutate.h" -#include "lgc/patch/PatchImageDerivatives.h" -#include "lgc/patch/PatchImageOpCollect.h" -#include "lgc/patch/PatchInOutImportExport.h" #include "lgc/patch/PatchInitializeWorkgroupMemory.h" -#include "lgc/patch/PatchInvariantLoads.h" -#include "lgc/patch/PatchLlvmIrInclusion.h" -#include "lgc/patch/PatchLoadScalarizer.h" -#include "lgc/patch/PatchLoopMetadata.h" -#include "lgc/patch/PatchMulDx9Zero.h" #include "lgc/patch/PatchPeepholeOpt.h" #include "lgc/patch/PatchPreparePipelineAbi.h" #include "lgc/patch/PatchReadFirstLane.h" #include "lgc/patch/PatchResourceCollect.h" #include "lgc/patch/PatchSetupTargetFeatures.h" #include "lgc/patch/PatchWorkarounds.h" +#include "lgc/patch/ScalarizeLoads.h" #include "lgc/patch/TcsPassthroughShader.h" #include "lgc/patch/VertexFetch.h" + #if LLPC_BUILD_STRIX1 #include 
"lgc/patch/WorkaroundDsSubdwordWrite.h" #endif @@ -200,23 +201,23 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T passMgr.addPass(PatchNullFragShader()); passMgr.addPass(PatchResourceCollect()); // also removes inactive/unused resources - // PatchCheckShaderCache depends on PatchResourceCollect - passMgr.addPass(PatchCheckShaderCache(std::move(checkShaderCacheFunc))); + // CheckShaderCache depends on PatchResourceCollect + passMgr.addPass(CheckShaderCache(std::move(checkShaderCacheFunc))); // First part of lowering to "AMDGCN-style" passMgr.addPass(PatchWorkarounds()); - passMgr.addPass(PatchCopyShader()); + passMgr.addPass(GenerateCopyShader()); passMgr.addPass(LowerVertexFetch()); passMgr.addPass(LowerFragColorExport()); passMgr.addPass(LowerDebugPrintf()); passMgr.addPass(LowerDesc()); - passMgr.addPass(PatchEntryPointMutate()); + passMgr.addPass(MutateEntryPoint()); passMgr.addPass(createModuleToFunctionPassAdaptor(LowerPopsInterlock())); passMgr.addPass(PatchInitializeWorkgroupMemory()); passMgr.addPass(PatchInOutImportExport()); // Patch invariant load and loop metadata. - passMgr.addPass(createModuleToFunctionPassAdaptor(PatchInvariantLoads())); + passMgr.addPass(createModuleToFunctionPassAdaptor(LowerInvariantLoads())); passMgr.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor(PatchLoopMetadata()))); #if LLPC_BUILD_STRIX1 @@ -276,7 +277,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); } - passMgr.addPass(PatchImageDerivatives()); + passMgr.addPass(LowerImageDerivatives()); // Set up target features in shader entry-points. 
// NOTE: Needs to be done after post-NGG function inlining, because LLVM refuses to inline something @@ -486,7 +487,7 @@ void Patch::addOptimizationPasses(lgc::PassManager &passMgr, uint32_t optLevel) void Patch::init(Module *module) { m_module = module; m_context = &m_module->getContext(); - m_shaderStage = ShaderStage::Invalid; + m_shaderStage = std::nullopt; m_entryPoint = nullptr; } diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/PatchBufferOp.cpp index c6948a2258..d5b7c1dac2 100644 --- a/lgc/patch/PatchBufferOp.cpp +++ b/lgc/patch/PatchBufferOp.cpp @@ -775,11 +775,14 @@ void BufferOpLowering::visitStridedBufferAddrAndStrideToPtr(StridedBufferAddrAnd // @param loadDescToPtr : The instruction void BufferOpLowering::visitBufferLoadDescToPtr(BufferLoadDescToPtrOp &loadDescToPtr) { m_builder.SetInsertPoint(&loadDescToPtr); - Value *descriptor = - createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); - - if (loadDescToPtr.getIsCompact()) - descriptor = createCompactDesc(descriptor, nullptr); + bool needLoadDesc = true; + Value *descriptor = loadDescToPtr.getDescPtr(); + if (needLoadDesc) { + descriptor = + createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); + if (loadDescToPtr.getIsCompact()) + descriptor = createCompactDesc(descriptor, nullptr); + } m_typeLowering.replaceInstruction(&loadDescToPtr, {descriptor, ConstantPointerNull::get(m_offsetType)}); @@ -804,11 +807,15 @@ void BufferOpLowering::visitBufferLoadDescToPtr(BufferLoadDescToPtrOp &loadDescT // @param loadDescToPtr : The instruction void BufferOpLowering::visitStridedBufferLoadDescToPtr(StridedBufferLoadDescToPtrOp &loadDescToPtr) { m_builder.SetInsertPoint(&loadDescToPtr); - Value *descriptor = - createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); - - if (loadDescToPtr.getIsCompact()) - descriptor = createCompactDesc(descriptor, 
loadDescToPtr.getStride()); + bool needLoadDesc = true; + Value *descriptor = loadDescToPtr.getDescPtr(); + if (needLoadDesc) { + descriptor = + createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); + + if (loadDescToPtr.getIsCompact()) + descriptor = createCompactDesc(descriptor, loadDescToPtr.getStride()); + } m_typeLowering.replaceInstruction(&loadDescToPtr, {descriptor, ConstantPointerNull::get(m_offsetType), m_builder.getInt32(0)}); @@ -1486,6 +1493,7 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { auto pointerValues = m_typeLowering.getValue(pointerOperand); Value *const bufferDesc = pointerValues[0]; + const bool isIndexedDesc = isa(bufferDesc->getType()); const DataLayout &dataLayout = m_builder.GetInsertBlock()->getModule()->getDataLayout(); @@ -1502,9 +1510,10 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { const bool isDlc = isGlc; // For buffer load on GFX10+, we set DLC = GLC Value *const baseIndex = m_builder.CreatePtrToInt(pointerValues[1], m_builder.getInt32Ty()); + const bool isDivergentDesc = getDescriptorInfo(bufferDesc).divergent.value(); - // If our buffer descriptor is divergent, need to handle that differently. - if (getDescriptorInfo(bufferDesc).divergent.value()) { + if (!isIndexedDesc && isDivergentDesc) { + // If our buffer descriptor is divergent, need to handle that differently in non resource indexing mode. 
auto createLoadStoreFunc = [&](Value *pointer) { Value *result = nullptr; if (isLoad) { @@ -1588,6 +1597,14 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { } } + auto getBufferDesc = [&]() -> Value * { + if (isIndexedDesc) { + auto address = m_builder.CreatePtrToInt(bufferDesc, m_builder.getInt64Ty()); + return m_builder.CreateTrunc(address, m_builder.getInt32Ty()); + } + return bufferDesc; + }; + // The index in storeValue which we use next unsigned storeIndex = 0; @@ -1635,49 +1652,51 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { } if (isLoad) { + bool accessSizeAllowed = true; if (m_pipelineState.getTargetInfo().getGfxIpVersion().major <= 11) { // TODO For stores? coherent.bits.dlc = isDlc; + accessSizeAllowed = accessSize >= 4; } - if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) { - Value *indexValue = pointerValues[2]; - CallInst *call = nullptr; - // Especially when the index is a constant, and the stride is known at compile-time, - // we should create s_buffer_load instructions with constant offsets: index * stride + offset - if ((isInvariant && accessSize >= 4) && isa(indexValue)) { - Value *desc1 = m_builder.CreateExtractElement(bufferDesc, 1); + + Value *indexValue = pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER + ? 
pointerValues[2] + : nullptr; + if (isInvariant && !isDivergentDesc && accessSizeAllowed) { + // create s.buffer.load + Value *desc = bufferDesc; + if (isIndexedDesc) + desc = m_builder.CreateLoad(FixedVectorType::get(m_builder.getInt32Ty(), 4), bufferDesc); + if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) { + // Especially when the index is a constant, and the stride is known at compile-time, + // we should create s_buffer_load instructions with constant offsets: index * stride + offset + assert(isa(indexValue)); + Value *desc1 = m_builder.CreateExtractElement(desc, 1); // stride is 61:48 bits in descriptor, which will always be constantInt when create BufferDesc Value *stride = m_builder.CreateAnd(m_builder.CreateLShr(desc1, m_builder.getInt32(16)), m_builder.getInt32(0x3fff)); Value *indexOffsetVal = m_builder.CreateMul(indexValue, stride); offsetVal = m_builder.CreateAdd(offsetVal, indexOffsetVal); - call = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_buffer_load, intAccessType, - {bufferDesc, offsetVal, m_builder.getInt32(coherent.u32All)}); - } else { - call = m_builder.CreateIntrinsic( - Intrinsic::amdgcn_struct_buffer_load, intAccessType, - {bufferDesc, indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } - copyMetadata(call, &inst); - if (isInvariant) - call->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {})); - part = call; - } else if (isInvariant && accessSize >= 4) { + CallInst *call = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_buffer_load, intAccessType, - {bufferDesc, offsetVal, m_builder.getInt32(coherent.u32All)}); + {desc, offsetVal, m_builder.getInt32(coherent.u32All)}); call->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {})); + copyMetadata(call, &inst); part = call; } else { - unsigned intrinsicID = Intrinsic::amdgcn_raw_buffer_load; -#if !defined(LLVM_HAVE_BRANCH_AMD_GFX) -#warning[!amd-gfx] 
Atomic load loses memory semantics -#else - if (ordering != AtomicOrdering::NotAtomic) - intrinsicID = Intrinsic::amdgcn_raw_atomic_buffer_load; -#endif - part = m_builder.CreateIntrinsic( - intrinsicID, intAccessType, - {bufferDesc, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + if (indexValue) { + part = m_builder.CreateIntrinsic( + Intrinsic::amdgcn_struct_buffer_load, intAccessType, + {getBufferDesc(), indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + } else { + unsigned intrinsicID = Intrinsic::amdgcn_raw_buffer_load; + if (ordering != AtomicOrdering::NotAtomic) + intrinsicID = Intrinsic::amdgcn_raw_atomic_buffer_load; + part = m_builder.CreateIntrinsic( + intrinsicID, intAccessType, + {getBufferDesc(), offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + } } } else { // Store @@ -1692,12 +1711,12 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { copyMetadata(part, &inst); if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) { part = m_builder.CreateIntrinsic(Intrinsic::amdgcn_struct_buffer_store, intAccessType, - {part, bufferDesc, pointerValues[2], offsetVal, m_builder.getInt32(0), + {part, getBufferDesc(), pointerValues[2], offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } else { part = m_builder.CreateIntrinsic( Intrinsic::amdgcn_raw_buffer_store, intAccessType, - {part, bufferDesc, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + {part, getBufferDesc(), offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } } diff --git a/lgc/patch/PatchInitializeWorkgroupMemory.cpp b/lgc/patch/PatchInitializeWorkgroupMemory.cpp index fd81de41e9..1d5cc9ce8f 100644 --- a/lgc/patch/PatchInitializeWorkgroupMemory.cpp +++ b/lgc/patch/PatchInitializeWorkgroupMemory.cpp @@ -79,7 +79,7 @@ PreservedAnalyses PatchInitializeWorkgroupMemory::run(Module &module, ModuleAnal 
Patch::init(&module); m_shaderStage = ShaderStage::Compute; - m_entryPoint = pipelineShaders.getEntryPoint(static_cast(m_shaderStage)); + m_entryPoint = pipelineShaders.getEntryPoint(m_shaderStage.value()); BuilderBase builder(*m_context); builder.SetInsertPointPastAllocas(m_entryPoint); @@ -133,7 +133,7 @@ void PatchInitializeWorkgroupMemory::initializeWithZero(GlobalVariable *lds, Bui builder.SetInsertPoint(originBlock->getTerminator()); // Get thread info auto &shaderMode = m_pipelineState->getShaderModes()->getComputeShaderMode(); - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; Value *localInvocationId = getFunctionArgument(m_entryPoint, entryArgIdxs.cs.localInvocationId); const unsigned actualNumThreads = shaderMode.workgroupSizeX * shaderMode.workgroupSizeY * shaderMode.workgroupSizeZ; diff --git a/lgc/patch/PatchResourceCollect.cpp b/lgc/patch/PatchResourceCollect.cpp index 6ba2a49442..85d8ec0f06 100644 --- a/lgc/patch/PatchResourceCollect.cpp +++ b/lgc/patch/PatchResourceCollect.cpp @@ -41,6 +41,7 @@ #include "lgc/util/BuilderBase.h" #include "lgc/util/Debug.h" #include "llvm-dialects/Dialect/Visitor.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -94,8 +95,7 @@ PreservedAnalyses PatchResourceCollect::run(Module &module, ModuleAnalysisManage m_tcsInputHasDynamicIndexing = false; bool needPack = false; - for (int shaderStage = 0; shaderStage < ShaderStage::GfxCount; ++shaderStage) { - ShaderStageEnum stage = static_cast(shaderStage); + for (auto stage : ShaderStagesGraphics) { if (pipelineState->hasShaderStage(stage) && (pipelineState->canPackInput(stage) || pipelineState->canPackOutput(stage))) { needPack = true; @@ -109,9 +109,9 @@ PreservedAnalyses PatchResourceCollect::run(Module &module, 
ModuleAnalysisManage } // Process each shader stage, in reverse order. We process FS even if it does not exist (part-pipeline compile). - for (int shaderStage = ShaderStage::CountInternal - 1; shaderStage >= 0; --shaderStage) { - m_entryPoint = pipelineShaders.getEntryPoint(static_cast(shaderStage)); - m_shaderStage = static_cast(shaderStage); + for (auto shaderStage : llvm::reverse(ShaderStagesNativeCopy)) { + m_entryPoint = pipelineShaders.getEntryPoint(shaderStage); + m_shaderStage = shaderStage; if (m_entryPoint) processShader(); else if (m_shaderStage == ShaderStage::Fragment) @@ -538,7 +538,9 @@ bool PatchResourceCollect::checkGsOnChipValidity() { // NOTE: Make gsVsVertexItemSize odd by "| 1", to optimize GS -> VS ring layout for LDS bank conflicts. unsigned gsVsVertexItemTotalSize = 0; for (int i = 0; i < MaxGsStreams; ++i) { - gsVsVertexItemSize[i] = (4 * gsResUsage->inOutUsage.gs.outLocCount[i]) | 1; + gsVsVertexItemSize[i] = 4 * gsResUsage->inOutUsage.gs.outLocCount[i]; + if (gsVsVertexItemSize[i] != 0) + gsVsVertexItemSize[i] |= 1; // If vertex item size is 0, this stream is inactive without any export. gsVsVertexItemTotalSize += gsVsVertexItemSize[i]; } @@ -631,6 +633,9 @@ bool PatchResourceCollect::checkGsOnChipValidity() { assert(gsInstanceCount == 1); } + // The minimum number of esVertsPerSubgroup must be at least the number of vertices per primitive. + esVertsPerSubgroup = std::max(inVertsPerPrim, esVertsPerSubgroup); + // NOTE: If ray query uses LDS stack, the expected max thread count in the group is 64. And we force wave size // to be 64 in order to keep all threads in the same wave. In the future, we could consider to get rid of this // restriction by providing the capability of querying thread ID in the group rather than in wave. 
@@ -952,69 +957,133 @@ bool PatchResourceCollect::checkGsOnChipValidity() { } LLPC_OUTS("===============================================================================\n"); - LLPC_OUTS("// LLPC geometry calculation factor results\n\n"); - LLPC_OUTS("ES vertices per subgroup: " << gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup << "\n"); - LLPC_OUTS("GS primitives per subgroup: " << gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup << "\n"); + LLPC_OUTS("// LLPC HW GS configurations\n\n"); + LLPC_OUTS("HW GS = "); + if (meshPipeline) { + LLPC_OUTS("Mesh shader\n"); + } else if (m_pipelineState->getNggControl()->enableNgg) { + LLPC_OUTS((hasGs ? "NGG GS" : "NGG") << "\n"); + } else { + LLPC_OUTS("Legacy GS (" << (gsOnChip ? "Onchip" : "Offchip") << ")\n"); + } LLPC_OUTS("\n"); - LLPC_OUTS("ES-GS LDS size (in dwords): " << gsResUsage->inOutUsage.gs.calcFactor.esGsLdsSize << "\n"); - LLPC_OUTS("On-chip GS LDS size (in dwords): " << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << "\n"); + + LLPC_OUTS("EsVerts = " << gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup << " verts/subgroup\n"); + LLPC_OUTS("GsPrims = " << gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup << " prims/subgroup\n"); + LLPC_OUTS("\n"); + + LLPC_OUTS("EsGsLdsSize = " << gsResUsage->inOutUsage.gs.calcFactor.esGsLdsSize << " dwords\n"); + LLPC_OUTS("GsOnchipLdsSize = " << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << " dwords\n"); + if (gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize > 0) { + LLPC_OUTS("RayQueryLdsStack = " << gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize << " dwords (Start = " + << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << ")\n"); + } LLPC_OUTS("\n"); - LLPC_OUTS("ES-GS ring item size (in dwords): " << gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize << "\n"); - LLPC_OUTS("GS-VS ring item size (in dwords): " << gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize << "\n"); + + 
LLPC_OUTS("EsGsRingItemSize = " << gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize << " dwords\n"); + LLPC_OUTS("GsVsRingItemSize = " << gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize << " dwords\n"); + LLPC_OUTS("GsVsVertexItemSizes = ["); + for (unsigned i = 0; i < MaxGsStreams; ++i) { + LLPC_OUTS(gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i]); + LLPC_OUTS((i == MaxGsStreams - 1 ? "" : ", ")); + } + LLPC_OUTS("] dwords\n"); LLPC_OUTS("\n"); + if (meshPipeline || m_pipelineState->getNggControl()->enableNgg) { + LLPC_OUTS("PrimAmpFactor = " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); + LLPC_OUTS("EnableMaxVertOut = " << (gsResUsage->inOutUsage.gs.calcFactor.enableMaxVertOut ? "true" : "false") + << "\n"); + LLPC_OUTS("\n"); + } + if (hasGs) { - LLPC_OUTS("GS stream item sizes (in dwords):\n"); + LLPC_OUTS("InputPrimitive = "); + switch (geometryMode.inputPrimitive) { + case InputPrimitives::Points: + LLPC_OUTS("Points\n"); + break; + case InputPrimitives::Lines: + LLPC_OUTS("Lines\n"); + break; + case InputPrimitives::LinesAdjacency: + LLPC_OUTS("LinesAdjacency\n"); + break; + case InputPrimitives::Triangles: + LLPC_OUTS("Triangles\n"); + break; + case InputPrimitives::TrianglesAdjacency: + LLPC_OUTS("TrianglesAdjacency\n"); + break; + case InputPrimitives::Patch: + LLPC_OUTS("Patch (ControlPoints = " << geometryMode.controlPoints << ")\n"); + break; + default: + break; + } + LLPC_OUTS("OutputPrimitive = "); + switch (geometryMode.outputPrimitive) { + case OutputPrimitives::Points: + LLPC_OUTS("Points\n"); + break; + case OutputPrimitives::LineStrip: + LLPC_OUTS("LineStrip\n"); + break; + case OutputPrimitives::TriangleStrip: + LLPC_OUTS("TriangleStrip\n"); + break; + default: + break; + } + LLPC_OUTS("Invocations = " << geometryMode.invocations << "\n"); + LLPC_OUTS("MaxOutputVertices = " << geometryMode.outputVertices << "\n"); + LLPC_OUTS("RobustGsEmits = " << (geometryMode.robustGsEmits ? 
"true" : "false") << "\n"); + LLPC_OUTS("\n"); + + const unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + LLPC_OUTS("RasterStream = "); + if (rasterStream != InvalidValue) + LLPC_OUTS("Stream[" << rasterStream << "]\n"); + else + LLPC_OUTS("NoRasterization\n"); + + const auto &streamXfbBuffers = m_pipelineState->getStreamXfbBuffers(); for (unsigned i = 0; i < MaxGsStreams; ++i) { unsigned streamItemSize = gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] * geometryMode.outputVertices; - LLPC_OUTS(" stream[" << i << "] = " << streamItemSize); - + LLPC_OUTS("Stream[" << i << "] = " << streamItemSize << " dwords"); + if (streamItemSize == 0) + LLPC_OUTS(" (Inactive)"); + LLPC_OUTS(" => "); if (m_pipelineState->enableXfb()) { - const auto &streamXfbBuffers = m_pipelineState->getStreamXfbBuffers(); - LLPC_OUTS(", XFB buffers = { "); if (streamXfbBuffers[i] != 0) { + LLPC_OUTS("XfbBuffer["); + bool printFirstXfbBuffer = true; for (unsigned j = 0; j < MaxTransformFeedbackBuffers; ++j) { - if ((streamXfbBuffers[i] & (1 << j)) != 0) - LLPC_OUTS(j << " "); + if ((streamXfbBuffers[i] & (1 << j)) != 0) { + LLPC_OUTS((printFirstXfbBuffer ? 
"" : ", ") << j << ""); + printFirstXfbBuffer = false; + } } + LLPC_OUTS("]"); + } else { + LLPC_OUTS("NoXfb"); } - LLPC_OUTS("}"); + } else { + LLPC_OUTS("NoXfb"); } - LLPC_OUTS("\n"); } LLPC_OUTS("\n"); } - if (gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize > 0) { - LLPC_OUTS("Ray query LDS stack size (in dwords): " - << gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize - << " (start = " << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << ")\n\n"); - } - - if (meshPipeline) { - LLPC_OUTS("GS primitive amplification factor: " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("GS is on-chip (Mesh)\n"); - } else if (m_pipelineState->getNggControl()->enableNgg) { - LLPC_OUTS("GS primitive amplifier: " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); - LLPC_OUTS("GS enable max output vertices: " - << (gsResUsage->inOutUsage.gs.calcFactor.enableMaxVertOut ? "true" : "false") << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("GS is on-chip (NGG)\n"); - } else { - LLPC_OUTS("GS is " << (gsOnChip ? "on-chip" : "off-chip") << "\n"); - } - LLPC_OUTS("\n"); - return gsOnChip; } // ===================================================================================================================== // Process a single shader. 
void PatchResourceCollect::processShader() { - m_resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + m_resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); // Invoke handling of "call" instruction visit(m_entryPoint); @@ -1081,7 +1150,7 @@ void PatchResourceCollect::processMissingFs() { assert(m_shaderStage == ShaderStage::Fragment); if (!m_processMissingFs) return; - m_resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + m_resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); FsInputMappings fsInputMappings = {}; m_pipelineState->getPalMetadata()->retrieveFragmentInputInfo(fsInputMappings); @@ -1226,7 +1295,7 @@ void PatchResourceCollect::visitCallInst(CallInst &callInst) { // Collect transform feedback export calls, used in SW-emulated stream-out. For GS, the collecting will // be done when we generate copy shader since GS is primitive-based. if (m_shaderStage != ShaderStage::Geometry) { - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; // A transform feedback export call is expected to be <4 x dword> at most inOutUsage.xfbExpCount += outputValue->getType()->getPrimitiveSizeInBits() > 128 ? 
2 : 1; } @@ -1506,11 +1575,11 @@ void PatchResourceCollect::matchGenericInOut() { assert(m_pipelineState->isGraphics()); // Do input matching and location remapping - bool packInput = m_pipelineState->canPackInput(m_shaderStage); + bool packInput = m_pipelineState->canPackInput(m_shaderStage.value()); if (m_shaderStage == ShaderStage::TessControl && m_tcsInputHasDynamicIndexing) { packInput = false; // Disable to pack VS-TCS - m_pipelineState->setPackInput(m_shaderStage, false); + m_pipelineState->setPackInput(m_shaderStage.value(), false); m_pipelineState->setPackOutput(ShaderStage::Vertex, false); } if (packInput) @@ -1519,7 +1588,7 @@ void PatchResourceCollect::matchGenericInOut() { updateInputLocInfoMapWithUnpack(); // Do output matching and location remapping - bool packOutput = m_pipelineState->canPackOutput(m_shaderStage); + bool packOutput = m_pipelineState->canPackOutput(m_shaderStage.value()); if (m_shaderStage == ShaderStage::Vertex && m_tcsInputHasDynamicIndexing) assert(!packOutput); if (packOutput) { @@ -1535,8 +1604,9 @@ void PatchResourceCollect::matchGenericInOut() { // Update location count of input/output LLPC_OUTS("===============================================================================\n"); - LLPC_OUTS("// LLPC location input/output mapping results (" << getShaderStageAbbreviation(m_shaderStage) << ")\n\n"); - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + LLPC_OUTS("// LLPC location input/output mapping results (" << getShaderStageAbbreviation(m_shaderStage.value()) + << ")\n\n"); + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &inLocInfoMap = inOutUsage.inputLocInfoMap; auto &outLocInfoMap = inOutUsage.outputLocInfoMap; auto &perPatchInLocMap = inOutUsage.perPatchInputLocMap; @@ -1553,8 +1623,8 @@ void PatchResourceCollect::matchGenericInOut() { const unsigned newComp = locInfoPair.second.getComponent(); assert(newLoc != InvalidValue); 
inOutUsage.inputMapLocCount = std::max(inOutUsage.inputMapLocCount, newLoc + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input: [location, component] = [" << origLoc - << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input: [location, component] = [" + << origLoc << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); } LLPC_OUTS("\n"); } @@ -1576,13 +1646,13 @@ void PatchResourceCollect::matchGenericInOut() { inOutUsage.gs.outLocCount[2] + inOutUsage.gs.outLocCount[3]; inOutUsage.outputMapLocCount = std::max(inOutUsage.outputMapLocCount, assignedLocCount); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: stream = " << streamId << ", " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output: stream = " << streamId << ", " << " [location, component] = [" << origLoc << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); } else { inOutUsage.outputMapLocCount = std::max(inOutUsage.outputMapLocCount, newLoc + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: [location, component] = [" << origLoc - << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output: [location, component] = [" + << origLoc << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); } } LLPC_OUTS("\n"); @@ -1593,8 +1663,8 @@ void PatchResourceCollect::matchGenericInOut() { for (auto locMap : perPatchInLocMap) { assert(m_shaderStage == ShaderStage::TessEval && locMap.second != InvalidValue); inOutUsage.perPatchInputMapLocCount = std::max(inOutUsage.perPatchInputMapLocCount, locMap.second + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input (per-patch): location = " << locMap.first - << " => 
Mapped = " << locMap.second << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-patch): location = " + << locMap.first << " => Mapped = " << locMap.second << "\n"); } LLPC_OUTS("\n"); } @@ -1604,8 +1674,8 @@ void PatchResourceCollect::matchGenericInOut() { for (auto locMap : perPatchOutLocMap) { assert(m_shaderStage == ShaderStage::TessControl && locMap.second != InvalidValue); inOutUsage.perPatchOutputMapLocCount = std::max(inOutUsage.perPatchOutputMapLocCount, locMap.second + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output (per-patch): location = " << locMap.first - << " => Mapped = " << locMap.second << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-patch): location = " + << locMap.first << " => Mapped = " << locMap.second << "\n"); } LLPC_OUTS("\n"); } @@ -1615,7 +1685,7 @@ void PatchResourceCollect::matchGenericInOut() { for (auto locMap : perPrimitiveInLocMap) { assert(m_shaderStage == ShaderStage::Fragment && locMap.second != InvalidValue); inOutUsage.perPrimitiveInputMapLocCount = std::max(inOutUsage.perPrimitiveInputMapLocCount, locMap.second + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input (per-primitive): location = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-primitive): location = " << locMap.first << " => Mapped = " << locMap.second << "\n"); } LLPC_OUTS("\n"); @@ -1626,31 +1696,31 @@ void PatchResourceCollect::matchGenericInOut() { for (auto locMap : perPrimitiveOutLocMap) { assert(m_shaderStage == ShaderStage::Mesh && locMap.second != InvalidValue); inOutUsage.perPrimitiveOutputMapLocCount = std::max(inOutUsage.perPrimitiveOutputMapLocCount, locMap.second + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output (per-primitive): location = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output 
(per-primitive): location = " << locMap.first << " => Mapped = " << locMap.second << "\n"); } LLPC_OUTS("\n"); } LLPC_OUTS("// LLPC location count results (after input/output matching) \n\n"); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input: locations = " << inOutUsage.inputMapLocCount - << "\n"); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: locations = " << inOutUsage.outputMapLocCount - << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) + << ") Input: locations = " << inOutUsage.inputMapLocCount << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) + << ") Output: locations = " << inOutUsage.outputMapLocCount << "\n"); if (m_shaderStage == ShaderStage::TessEval) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-patch): locations = " << inOutUsage.perPatchInputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::TessControl) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-patch): locations = " << inOutUsage.perPatchOutputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::Fragment) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-primitive): locations = " << inOutUsage.perPrimitiveInputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::Mesh) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-primitive): locations = " << inOutUsage.perPrimitiveOutputMapLocCount << "\n"); } LLPC_OUTS("\n"); @@ -1663,12 +1733,12 @@ void PatchResourceCollect::matchGenericInOut() { void PatchResourceCollect::mapBuiltInToGenericInOut() { 
assert(m_pipelineState->isGraphics()); - const auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + const auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); auto &builtInUsage = resUsage->builtInUsage; auto &inOutUsage = resUsage->inOutUsage; - const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); + const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); auto nextResUsage = nextStage ? m_pipelineState->getShaderResourceUsage(nextStage.value()) : nullptr; assert(inOutUsage.builtInInputLocMap.empty()); // Should be empty @@ -2060,7 +2130,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { // NOTE: If gl_in[].gl_ClipDistance is used, we have to check the usage of gl_out[].gl_ClipDistance in // tessellation control shader. The clip distance is the maximum of the two. We do this to avoid // incorrectness of location assignment during builtin-to-generic mapping. - const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage.value()); if (prevStage == ShaderStage::TessControl) { const auto &prevBuiltInUsage = m_pipelineState->getShaderResourceUsage(prevStage.value())->builtInUsage.tcs; clipDistanceCount = std::max(clipDistanceCount, prevBuiltInUsage.clipDistance); @@ -2074,7 +2144,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { if (builtInUsage.tes.cullDistanceIn > 0) { unsigned cullDistanceCount = builtInUsage.tes.cullDistanceIn; - const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage.value()); if (prevStage == ShaderStage::TessControl) { const auto &prevBuiltInUsage = m_pipelineState->getShaderResourceUsage(prevStage.value())->builtInUsage.tcs; cullDistanceCount = std::max(cullDistanceCount, prevBuiltInUsage.clipDistance); @@ -2411,7 +2481,7 @@ void 
PatchResourceCollect::mapBuiltInToGenericInOut() { std::max(inOutUsage.perPrimitiveOutputMapLocCount, availPerPrimitiveOutMapLoc); } else if (m_shaderStage == ShaderStage::Fragment) { // FS - const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage.value()); unsigned availInMapLoc = inOutUsage.inputMapLocCount; unsigned availPerPrimitiveInMapLoc = inOutUsage.perPrimitiveInputMapLocCount; @@ -2467,11 +2537,12 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { // Do builtin-to-generic mapping LLPC_OUTS("===============================================================================\n"); - LLPC_OUTS("// LLPC builtin-to-generic mapping results (" << getShaderStageAbbreviation(m_shaderStage) << ")\n\n"); + LLPC_OUTS("// LLPC builtin-to-generic mapping results (" << getShaderStageAbbreviation(m_shaderStage.value()) + << ")\n\n"); for (const auto &builtInMap : inOutUsage.builtInInputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input: builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input: builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.builtInInputLocMap.empty()) @@ -2482,11 +2553,11 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { const unsigned loc = builtInMap.second; if (m_shaderStage == ShaderStage::Geometry) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output: stream = " << m_pipelineState->getRasterizerState().rasterStream << " , " << "builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } else { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: builtin = " + LLPC_OUTS("(" 
<< getShaderStageAbbreviation(m_shaderStage.value()) << ") Output: builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } } @@ -2496,7 +2567,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { for (const auto &builtInMap : inOutUsage.perPatchBuiltInInputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input (per-patch): builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-patch): builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.perPatchBuiltInInputLocMap.empty()) @@ -2505,7 +2576,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { for (const auto &builtInMap : inOutUsage.perPatchBuiltInOutputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output (per-patch): builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-patch): builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.perPatchBuiltInOutputLocMap.empty()) @@ -2514,7 +2585,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { for (const auto &builtInMap : inOutUsage.perPrimitiveBuiltInInputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input (per-primitive): builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-primitive): builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.perPrimitiveBuiltInInputLocMap.empty()) @@ -2523,31 +2594,31 @@ void 
PatchResourceCollect::mapBuiltInToGenericInOut() { for (const auto &builtInMap : inOutUsage.perPrimitiveBuiltInOutputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output (per-primitive): builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-primitive): builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.perPrimitiveBuiltInOutputLocMap.empty()) LLPC_OUTS("\n"); LLPC_OUTS("// LLPC location count results (after builtin-to-generic mapping)\n\n"); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input: locations = " << inOutUsage.inputMapLocCount - << "\n"); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: locations = " << inOutUsage.outputMapLocCount - << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) + << ") Input: locations = " << inOutUsage.inputMapLocCount << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) + << ") Output: locations = " << inOutUsage.outputMapLocCount << "\n"); if (m_shaderStage == ShaderStage::TessEval) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-patch): locations = " << inOutUsage.perPatchInputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::TessControl) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-patch): locations = " << inOutUsage.perPatchOutputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::Fragment) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-primitive): locations = " << 
inOutUsage.perPrimitiveInputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::Mesh) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-primitive): locations = " << inOutUsage.perPrimitiveOutputMapLocCount << "\n"); } LLPC_OUTS("\n"); @@ -2578,7 +2649,7 @@ void PatchResourceCollect::mapGsBuiltInOutput(unsigned builtInId, unsigned elemC // ===================================================================================================================== // Update the inputLocInfoutputoMap, perPatchInputLocMap and perPrimitiveInputLocMap void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &inputLocInfoMap = inOutUsage.inputLocInfoMap; // Remove unused locationInfo bool eraseUnusedLocInfo = !m_pipelineState->isUnlinked(); // Should be whole pipeline compilation @@ -2656,7 +2727,7 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { // corresponding input location in the next stage. For example, if TCS output has dynamic location indexing from // [0,2], we need add the corresponding location info to TES input map. Otherwise, it will cause mismatch when the // dynamic indexing is in a loop and TES only uses location 1. 
- auto preStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + auto preStage = m_pipelineState->getPrevShaderStage(m_shaderStage.value()); if (preStage == ShaderStage::TessControl || preStage == ShaderStage::Mesh) { if (!inputLocInfoMap.empty()) { auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(preStage.value())->inOutUsage.outputLocInfoMap; @@ -2734,8 +2805,8 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { // ===================================================================================================================== // Clear unused output from outputLocInfoMap, perPatchOutputLocMap, and perPrimitiveOutputLocMap void PatchResourceCollect::clearUnusedOutput() { - auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &outputLocInfoMap = inOutUsage.outputLocInfoMap; if (nextStage) { // Collect the locations of TCS's imported outputs @@ -2878,8 +2949,8 @@ void PatchResourceCollect::clearUnusedOutput() { void PatchResourceCollect::updateOutputLocInfoMapWithUnpack() { clearUnusedOutput(); - const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; // // Update per-vertex output location info @@ -3133,7 +3204,7 @@ bool PatchResourceCollect::canChangeOutputLocationsForGs() { // ===================================================================================================================== // Update inputLocInfoMap based on {TCS, GS, FS} input import 
calls void PatchResourceCollect::updateInputLocInfoMapWithPack() { - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &inputLocInfoMap = inOutUsage.inputLocInfoMap; inputLocInfoMap.clear(); @@ -3153,7 +3224,7 @@ void PatchResourceCollect::updateInputLocInfoMapWithPack() { bool isFsAndHasGs = (isFs && (m_pipelineState->hasShaderStage(ShaderStage::Geometry) || partPipelineHasGs)); bool requireDword = isTcs || isGs || isFsAndHasGs; // Create locationMap - m_locationInfoMapManager->createMap(m_inputCalls, m_shaderStage, requireDword); + m_locationInfoMapManager->createMap(m_inputCalls, m_shaderStage.value(), requireDword); // Fill inputLocInfoMap of {TCS, GS, FS} for the packable calls unsigned newLocIdx = 0; @@ -3176,112 +3247,128 @@ void PatchResourceCollect::updateInputLocInfoMapWithPack() { // ===================================================================================================================== // Update outputLocInfoMap based on inputLocInfoMap of next stage or GS output export calls for copy shader void PatchResourceCollect::updateOutputLocInfoMapWithPack() { - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &outputLocInfoMap = inOutUsage.outputLocInfoMap; - outputLocInfoMap.clear(); + outputLocInfoMap.clear(); // Clear it, will reconstruct if (m_outputCalls.empty()) return; assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || - m_shaderStage == ShaderStage::Geometry); - auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); + m_shaderStage == ShaderStage::Geometry); // Possible stages + auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); auto &nextStageInputLocInfoMap = 
m_pipelineState->getShaderResourceUsage(nextStage.value())->inOutUsage.inputLocInfoMap; // Remove unused outputs and update the output map if (m_shaderStage != m_pipelineState->getLastVertexProcessingStage()) { - // For VS-{TCS, GS}, the dead output has no matching input of the next stage + // Not last vertex processing stage, collect dead outputs that have no matching inputs of the next stage. + + // Collect dead output calls. for (auto call : m_outputCalls) { InOutLocationInfo origLocInfo; origLocInfo.setLocation(cast(call->getOperand(0))->getZExtValue()); origLocInfo.setComponent(cast(call->getOperand(1))->getZExtValue()); - if (nextStageInputLocInfoMap.find(origLocInfo) == nextStageInputLocInfoMap.end()) + if (nextStageInputLocInfoMap.count(origLocInfo) == 0) m_deadCalls.push_back(call); } - // The output map should be equal to the input map of the next stage + + // Use the input map of the next stage to update the output map of current stage. outputLocInfoMap = nextStageInputLocInfoMap; } else { - // For {VS, TES, GS}-FS, the dead output is neither a XFB output or a corresponding FS' input. + // Last vertex processing stage, collect dead outputs that are not XFB output or have no matching FS inputs. assert(nextStage == ShaderStage::Fragment); - // Collect XFB locations - auto &xfbOutLocInfoMap = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage.locInfoXfbOutInfoMap; - std::set xfbOutputLocs[MaxGsStreams]; - for (const auto &locInfoPair : xfbOutLocInfoMap) { - const auto &locInfo = locInfoPair.first; - xfbOutputLocs[locInfo.getStreamId()].insert(locInfo.getLocation()); + const unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + + // Collect XFB output location pair . 
+ SmallSet, MaxInOutLocCount> xfbOutputLocPairs[MaxGsStreams]; + auto &xfbOutInfoMap = + m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage.locInfoXfbOutInfoMap; + for (const auto &xfbOutLocInfo : xfbOutInfoMap) { + const auto &locInfo = xfbOutLocInfo.first; + if (locInfo.isBuiltIn()) + continue; // Skip built-in outputs + xfbOutputLocPairs[locInfo.getStreamId()].insert(std::make_pair(locInfo.getLocation(), locInfo.getComponent())); } - // Store the output calls that have no corresponding input in FS + // Collect the output calls that have no corresponding FS inputs. std::vector noMappedCalls; for (auto call : m_outputCalls) { - // NOTE: Don't set stream ID to the original output location info for GS. This is because the corresponding input - // location info of FS doesn't have stream ID. This will cause in-out mismatch. - InOutLocationInfo origLocInfo; - origLocInfo.setLocation(cast(call->getOperand(0))->getZExtValue()); - origLocInfo.setComponent(cast(call->getOperand(1))->getZExtValue()); - - const unsigned origLocation = origLocInfo.getLocation(); - const bool hasNoMappedInput = (nextStageInputLocInfoMap.find(origLocInfo) == nextStageInputLocInfoMap.end()); - if (hasNoMappedInput) { - const unsigned streamId = - m_shaderStage == ShaderStage::Geometry ? cast(call->getOperand(2))->getZExtValue() : 0; - - if (xfbOutputLocs[streamId].count(origLocation) == 0) + const unsigned location = cast(call->getOperand(0))->getZExtValue(); + const unsigned component = cast(call->getOperand(1))->getZExtValue(); + const unsigned streamId = + m_shaderStage == ShaderStage::Geometry ? cast(call->getOperand(2))->getZExtValue() : 0; + + bool noMappedInput = true; + if (streamId == rasterStream) { + // Skip checking FS inputs if this output doesn't belong to rasterization stream. 
+ InOutLocationInfo origLocInfo; + origLocInfo.setLocation(location); + origLocInfo.setComponent(component); + // NOTE: Don't set stream ID to the original output location info for GS. This is because the corresponding + // input location info of FS doesn't have stream ID. This will cause in-out mismatch. + noMappedInput = nextStageInputLocInfoMap.count(origLocInfo) == 0; + } + + if (noMappedInput) { + if (xfbOutputLocPairs[streamId].count(std::make_pair(location, component)) == 0) m_deadCalls.push_back(call); else noMappedCalls.push_back(call); } } - // The output map of current stage contains at most two parts: the first part is consistent with FS input map and - // the second part is built from the no mapped calls. - std::vector outLocInfos; + + // The output map of current stage consists of two parts: the first part is consistent with FS input map and + // the second part is from the no mapped calls. + std::vector noMappedOutputLocInfos; for (auto call : noMappedCalls) { InOutLocationInfo origLocInfo; origLocInfo.setLocation(cast(call->getOperand(0))->getZExtValue()); origLocInfo.setComponent(cast(call->getOperand(1))->getZExtValue()); if (m_shaderStage == ShaderStage::Geometry) origLocInfo.setStreamId(cast(call->getOperand(2))->getZExtValue()); - outLocInfos.push_back(origLocInfo); + noMappedOutputLocInfos.push_back(origLocInfo); } - m_locationInfoMapManager->createMap(outLocInfos, m_shaderStage); - const auto &calcOutLocInfoMap = m_locationInfoMapManager->getMap(); + m_locationInfoMapManager->createMap(noMappedOutputLocInfos, m_shaderStage.value()); + const auto &noMappedOutputLocInfoMap = m_locationInfoMapManager->getMap(); + // Reconstruct the first part of output map by using FS input map. if (m_shaderStage == ShaderStage::Geometry) { - // NOTE: The output location info from next shader stage (FS) doesn't contain raster stream ID. We have to - // reconstruct it. 
- const auto rasterStream = m_pipelineState->getRasterizerState().rasterStream; - for (auto &entry : nextStageInputLocInfoMap) { - InOutLocationInfo origLocInfo(entry.first); - origLocInfo.setStreamId(rasterStream); - InOutLocationInfo newLocInfo(entry.second); - newLocInfo.setStreamId(rasterStream); - outputLocInfoMap.insert({origLocInfo, newLocInfo}); + if (rasterStream != InvalidValue) { + // NOTE: The output location info from FS doesn't contain rasterization stream ID. We have to reconstruct it. + for (auto &locInfo : nextStageInputLocInfoMap) { + InOutLocationInfo origLocInfo(locInfo.first); + origLocInfo.setStreamId(rasterStream); + InOutLocationInfo newLocInfo(locInfo.second); + newLocInfo.setStreamId(rasterStream); + outputLocInfoMap.insert({origLocInfo, newLocInfo}); + } } } else { outputLocInfoMap = nextStageInputLocInfoMap; } - unsigned newLocMax = 0; - for (const auto &entry : outputLocInfoMap) - newLocMax = std::max(newLocMax, entry.second.getLocation() + 1); - // Update output map - for (const auto &entry : calcOutLocInfoMap) { - InOutLocationInfo origLocInfo; - origLocInfo.setStreamId(entry.first.getStreamId()); - origLocInfo.setLocation(entry.first.getLocation()); - origLocInfo.setComponent(entry.first.getComponent()); - InOutLocationInfo newLocInfo(entry.second); - newLocInfo.setLocation(newLocInfo.getLocation() + newLocMax); + // Reconstruct the second part of output map by visiting each call of XFB output. + unsigned maxMappedLoc[MaxGsStreams] = {}; + for (const auto &locInfo : outputLocInfoMap) { + maxMappedLoc[locInfo.first.getStreamId()] = + std::max(maxMappedLoc[locInfo.first.getStreamId()], locInfo.second.getLocation() + 1); + } + + // Update output map for those XFB outputs. 
+ for (const auto &locInfo : noMappedOutputLocInfoMap) { + InOutLocationInfo origLocInfo(locInfo.first); + InOutLocationInfo newLocInfo(locInfo.second); + newLocInfo.setLocation(newLocInfo.getLocation() + maxMappedLoc[locInfo.first.getStreamId()]); outputLocInfoMap.insert({origLocInfo, newLocInfo}); } - // update output count per stream for GS + // Update output count per stream for GS if (m_shaderStage == ShaderStage::Geometry) { - for (auto &locInfoPair : outputLocInfoMap) { - auto &outLocCount = inOutUsage.gs.outLocCount[locInfoPair.first.getStreamId()]; - outLocCount = std::max(outLocCount, locInfoPair.second.getLocation() + 1); + for (auto &locInfo : outputLocInfoMap) { + auto &outLocCount = inOutUsage.gs.outLocCount[locInfo.first.getStreamId()]; + outLocCount = std::max(outLocCount, locInfo.second.getLocation() + 1); } } } @@ -3292,7 +3379,7 @@ void PatchResourceCollect::updateOutputLocInfoMapWithPack() { void PatchResourceCollect::reassembleOutputExportCalls() { if (m_outputCalls.empty()) return; - assert(m_pipelineState->canPackOutput(m_shaderStage)); + assert(m_pipelineState->canPackOutput(m_shaderStage.value())); BuilderBase builder(*m_context); builder.SetInsertPoint(m_outputCalls.back()); @@ -3317,7 +3404,7 @@ void PatchResourceCollect::reassembleOutputExportCalls() { }; // Collect ElementsInfo in each packed location - auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage.outputLocInfoMap; + auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage.outputLocInfoMap; std::vector elementsInfoArray(outputLocInfoMap.size()); for (auto call : m_outputCalls) { @@ -3746,7 +3833,7 @@ void PatchResourceCollect::clearUndefinedOutput() { for (auto call : candidateCalls) { // For unlinked case, we should keep the location info map unchanged. 
- if (m_pipelineState->getNextShaderStage(m_shaderStage)) { + if (m_pipelineState->getNextShaderStage(m_shaderStage.value())) { // Remove the output location info if it exists unsigned index = m_shaderStage == ShaderStage::Mesh ? 2 : 1; unsigned component = cast(call->getArgOperand(index))->getZExtValue(); diff --git a/lgc/patch/RegisterMetadataBuilder.cpp b/lgc/patch/RegisterMetadataBuilder.cpp index 476a641d93..c982836fc9 100644 --- a/lgc/patch/RegisterMetadataBuilder.cpp +++ b/lgc/patch/RegisterMetadataBuilder.cpp @@ -684,9 +684,16 @@ void RegisterMetadataBuilder::buildHwVsRegisters() { vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::Streamout_1En] = enablePrimStats || streamXfbBuffers[1] > 0; vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::Streamout_2En] = enablePrimStats || streamXfbBuffers[2] > 0; vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::Streamout_3En] = enablePrimStats || streamXfbBuffers[3] > 0; - if (shaderStage == ShaderStage::CopyShader) - vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::RastStream] = - m_pipelineState->getRasterizerState().rasterStream; + if (shaderStage == ShaderStage::CopyShader) { + unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + if (m_pipelineState->getRasterizerState().rasterStream == InvalidValue) { + // NOTE: According to HW register spec, rasterization stream has 3 bits, the lower 2 bits are programmed to stream + // ID (0~3). If rasterization is not enabled for any stream, set the highest 1 bit to 1. 
+ static const unsigned NoRasterStream = 0x4; + rasterStream = NoRasterStream; + } + vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::RastStream] = rasterStream; + } // Set some field of SPI_SHADER_PGM_RSRC2_VS getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VsStreamoutEn] = enableXfb; diff --git a/lgc/patch/PatchLoadScalarizer.cpp b/lgc/patch/ScalarizeLoads.cpp similarity index 98% rename from lgc/patch/PatchLoadScalarizer.cpp rename to lgc/patch/ScalarizeLoads.cpp index a551bff048..964e0a9636 100644 --- a/lgc/patch/PatchLoadScalarizer.cpp +++ b/lgc/patch/ScalarizeLoads.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLoadScalarizer.cpp + * @file ScalarizeLoads.cpp * @brief LLPC source file: contains implementation of class lgc::PatchLoadScalarizer. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchLoadScalarizer.h" +#include "lgc/patch/ScalarizeLoads.h" #include "lgc/state/PipelineShaders.h" #include "lgc/state/PipelineState.h" #include "llvm/IR/Constants.h" diff --git a/lgc/patch/ShaderInputs.cpp b/lgc/patch/ShaderInputs.cpp index 0c6a1253a3..fd8b6c4a02 100644 --- a/lgc/patch/ShaderInputs.cpp +++ b/lgc/patch/ShaderInputs.cpp @@ -308,7 +308,7 @@ const char *ShaderInputs::getInputName(ShaderInput inputKind) { } // ===================================================================================================================== -// Gather usage of shader inputs from before PatchEntryPointMutate +// Gather usage of shader inputs from before MutateEntryPoint // // @param module : IR module void ShaderInputs::gatherUsage(Module &module) { @@ -397,7 +397,7 @@ void ShaderInputs::fixupUses(Module &module, PipelineState *pipelineState, bool // The new ShaderInputs scheme means that InOutBuilder or PatchResourceCollect no longer needs to set // the builtInUsage field for an input that is generated using ShaderInputs::getInput() and/or - // ShaderInputs::getSpecialUserData() (before PatchEntryPointMutate), and we can remove that + // ShaderInputs::getSpecialUserData() (before MutateEntryPoint), and we can remove that // builtInUsage field. 
// // However, in some cases, the builtInUsage field is used in NggPrimShader and/or Gfx*ConfigBuilder @@ -735,7 +735,6 @@ uint64_t ShaderInputs::getShaderArgTys(PipelineState *pipelineState, ShaderStage // // @param stage : Shader stage ShaderInputs::ShaderInputsUsage *ShaderInputs::getShaderInputsUsage(ShaderStageEnum stage) { - m_shaderInputsUsage.resize(std::max(m_shaderInputsUsage.size(), static_cast(stage) + 1)); return &m_shaderInputsUsage[stage]; } diff --git a/lgc/state/PassManagerCache.cpp b/lgc/state/PassManagerCache.cpp index 32f8f8425c..25a6325651 100644 --- a/lgc/state/PassManagerCache.cpp +++ b/lgc/state/PassManagerCache.cpp @@ -30,7 +30,7 @@ */ #include "lgc/state/PassManagerCache.h" #include "lgc/LgcContext.h" -#include "lgc/patch/PatchLlvmIrInclusion.h" +#include "lgc/patch/IncludeLlvmIr.h" #include "lgc/patch/PatchSetupTargetFeatures.h" #include "llvm/Analysis/TargetTransformInfo.h" #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 442438 diff --git a/lgc/state/PipelineShaders.cpp b/lgc/state/PipelineShaders.cpp index c14b41e61e..6ae6f579c3 100644 --- a/lgc/state/PipelineShaders.cpp +++ b/lgc/state/PipelineShaders.cpp @@ -44,8 +44,6 @@ AnalysisKey PipelineShaders::Key; // ===================================================================================================================== PipelineShadersResult::PipelineShadersResult() { - for (auto &entryPoint : m_entryPoints) - entryPoint = nullptr; } // ===================================================================================================================== @@ -77,8 +75,7 @@ PipelineShadersResult PipelineShaders::run(Module &module, ModuleAnalysisManager // // @param shaderStage : Shader stage Function *PipelineShadersResult::getEntryPoint(ShaderStageEnum shaderStage) const { - assert((unsigned)shaderStage < ShaderStage::CountInternal); - return m_entryPoints[shaderStage]; + return m_entryPoints.lookup(shaderStage); } // 
===================================================================================================================== diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp index b7a85906ee..947559e43e 100644 --- a/lgc/state/PipelineState.cpp +++ b/lgc/state/PipelineState.cpp @@ -589,9 +589,7 @@ ShaderStageMask PipelineState::getShaderStageMask() { // ===================================================================================================================== // Check whether the pipeline is a graphics pipeline bool PipelineState::isGraphics() { - return getShaderStageMask().contains_any({ShaderStage::Task, ShaderStage::Vertex, ShaderStage::TessControl, - ShaderStage::TessEval, ShaderStage::Geometry, ShaderStage::Mesh, - ShaderStage::Fragment}); + return getShaderStageMask().contains_any(ShaderStagesGraphics); } // ===================================================================================================================== @@ -600,8 +598,6 @@ bool PipelineState::isGraphics() { // @param stage : Shader stage // @param options : Shader options void PipelineState::setShaderOptions(ShaderStageEnum stage, const ShaderOptions &options) { - if (m_shaderOptions.size() <= stage) - m_shaderOptions.resize(stage + 1); m_shaderOptions[stage] = options; } @@ -610,8 +606,6 @@ void PipelineState::setShaderOptions(ShaderStageEnum stage, const ShaderOptions // // @param stage : Shader stage const ShaderOptions &PipelineState::getShaderOptions(ShaderStageEnum stage) { - if (m_shaderOptions.size() <= stage) - m_shaderOptions.resize(stage + 1); return m_shaderOptions[stage]; } @@ -632,9 +626,12 @@ void PipelineState::recordOptions(Module *module) { if (unsigned preRasterHasGs = unsigned(m_preRasterHasGs)) setNamedMetadataToArrayOfInt32(module, preRasterHasGs, PreRasterHasGsMetadataName); setNamedMetadataToArrayOfInt32(module, m_options, OptionsMetadataName); - for (unsigned stage = 0; stage != m_shaderOptions.size(); ++stage) { - std::string metadataName 
= - (Twine(OptionsMetadataName) + "." + getShaderStageAbbreviation(static_cast(stage))).str(); + // Iterate stages in deterministic order + for (auto stage : ShaderStagesNative) { + if (!m_shaderOptions.contains(stage)) + continue; + + std::string metadataName = (Twine(OptionsMetadataName) + "." + getShaderStageAbbreviation(stage)).str(); setNamedMetadataToArrayOfInt32(module, m_shaderOptions[stage], metadataName); } } @@ -664,13 +661,12 @@ void PipelineState::readOptions(Module *module) { m_preRasterHasGs = preRasterHasGsAsInt; readNamedMetadataArrayOfInt32(module, OptionsMetadataName, m_options); - for (unsigned stage = 0; stage != ShaderStage::Compute + 1; ++stage) { + for (auto stage : ShaderStagesNative) { std::string metadataName = (Twine(OptionsMetadataName) + "." + getShaderStageAbbreviation(static_cast(stage))).str(); auto namedMetaNode = module->getNamedMetadata(metadataName); if (!namedMetaNode || namedMetaNode->getNumOperands() == 0) continue; - m_shaderOptions.resize(stage + 1); readArrayOfInt32MetaNode(namedMetaNode->getOperand(0), m_shaderOptions[stage]); } } @@ -769,7 +765,7 @@ void PipelineState::recordUserDataTable(ArrayRef nodes, NamedMDNod // Operand 1: matchType operands.push_back(ConstantAsMetadata::get(builder.getInt32(static_cast(node.abstractType)))); // Operand 2: visibility - operands.push_back(ConstantAsMetadata::get(builder.getInt32(node.visibility))); + operands.push_back(ConstantAsMetadata::get(builder.getInt32(node.visibility.toRaw()))); // Operand 3: offsetInDwords operands.push_back(ConstantAsMetadata::get(builder.getInt32(node.offsetInDwords))); // Operand 4: sizeInDwords @@ -840,7 +836,8 @@ void PipelineState::readUserDataNodes(Module *module) { nextNode->abstractType = static_cast(mdconst::extract(metadataNode->getOperand(1))->getZExtValue()); // Operand 2: visibility - nextNode->visibility = mdconst::extract(metadataNode->getOperand(2))->getZExtValue(); + nextNode->visibility = + 
ShaderStageMask::fromRaw(mdconst::extract(metadataNode->getOperand(2))->getZExtValue()); // Operand 3: offsetInDwords nextNode->offsetInDwords = mdconst::extract(metadataNode->getOperand(3))->getZExtValue(); // Operand 4: sizeInDwords @@ -900,18 +897,22 @@ void PipelineState::readUserDataNodes(Module *module) { // // @param stage : Shader stage to check against nodes' visibility field, or ShaderStage::Invalid for any const ResourceNode *PipelineState::findPushConstantResourceNode(std::optional stage) const { - unsigned visibilityMask = UINT_MAX; - if (stage) - visibilityMask = 1 << std::min(unsigned(stage.value()), unsigned(ShaderStage::Compute)); + ShaderStageMask visibilityMask(ShaderStages); + if (stage) { + ShaderStageEnum maskStage = stage.value(); + if (!ShaderStageMask(ShaderStagesNative).contains(maskStage)) + maskStage = ShaderStage::Compute; + visibilityMask = ShaderStageMask(maskStage); + } for (const ResourceNode &node : getUserDataNodes()) { - if (node.visibility != 0 && (node.visibility & visibilityMask) == 0) + if (!node.visibility.empty() && (node.visibility & visibilityMask).empty()) continue; if (node.concreteType == ResourceNodeType::PushConst) return &node; if (node.concreteType == ResourceNodeType::DescriptorTableVaPtr) { if (!node.innerTable.empty() && node.innerTable[0].concreteType == ResourceNodeType::PushConst) { - if (node.innerTable[0].visibility != 0 && (node.innerTable[0].visibility & visibilityMask) == 0) + if (!node.innerTable[0].visibility.empty() && (node.innerTable[0].visibility & visibilityMask).empty()) continue; assert(ResourceLayoutScheme::Indirect == m_options.resourceLayoutScheme); return &node; @@ -921,50 +922,6 @@ const ResourceNode *PipelineState::findPushConstantResourceNode(std::optional Unknown <--------------+--------------------+ -// -// @param nodeType : Resource node type -// @param candidateType : Resource node candidate type -static bool isNodeTypeCompatible(ResourceNodeType nodeType, ResourceNodeType 
candidateType) { - if (nodeType == ResourceNodeType::Unknown || candidateType == nodeType || - candidateType == ResourceNodeType::DescriptorMutable) - return true; - - if ((nodeType == ResourceNodeType::DescriptorConstBuffer || nodeType == DescriptorAnyBuffer) && - (candidateType == ResourceNodeType::DescriptorConstBufferCompact || - candidateType == ResourceNodeType::DescriptorConstBuffer || candidateType == ResourceNodeType::InlineBuffer)) - return true; - - if ((nodeType == ResourceNodeType::DescriptorBuffer || nodeType == DescriptorAnyBuffer) && - (candidateType == ResourceNodeType::DescriptorBufferCompact || - candidateType == ResourceNodeType::DescriptorBuffer)) - return true; - - if ((nodeType == ResourceNodeType::DescriptorResource || nodeType == ResourceNodeType::DescriptorTexelBuffer || - nodeType == ResourceNodeType::DescriptorSampler) && - candidateType == ResourceNodeType::DescriptorCombinedTexture) - return true; - - return false; -} - // ===================================================================================================================== // Returns true when type is one that has a binding. // @param nodeType : Resource node type @@ -998,12 +955,10 @@ static bool nodeTypeHasBinding(ResourceNodeType nodeType) { // sizeInDwords/stride. 
// // @param node : Node to try and match -// @param nodeType : Resource node type being searched for // @param descSet : Descriptor set being searched for // @param binding : Descriptor binding being searched for -bool PipelineState::matchResourceNode(const ResourceNode &node, ResourceNodeType nodeType, uint64_t descSet, - unsigned binding) const { - if (node.set != descSet || !isNodeTypeCompatible(nodeType, node.abstractType)) +bool PipelineState::matchResourceNode(const ResourceNode &node, uint64_t descSet, unsigned binding) const { + if (node.set != descSet) return false; if (node.binding == binding) return true; @@ -1032,14 +987,18 @@ bool PipelineState::matchResourceNode(const ResourceNode &node, ResourceNodeType std::pair PipelineState::findResourceNode(ResourceNodeType nodeType, uint64_t descSet, unsigned binding, std::optional stage) const { - unsigned visibilityMask = UINT_MAX; - if (stage) - visibilityMask = 1 << std::min(unsigned(stage.value()), unsigned(ShaderStage::Compute)); + ShaderStageMask visibilityMask(ShaderStages); + if (stage) { + ShaderStageEnum maskStage = stage.value(); + if (!ShaderStageMask(ShaderStagesNative).contains(maskStage)) + maskStage = ShaderStage::Compute; + visibilityMask = ShaderStageMask(maskStage); + } for (const ResourceNode &node : getUserDataNodes()) { if (!nodeTypeHasBinding(node.concreteType)) continue; - if (node.visibility != 0 && (node.visibility & visibilityMask) == 0) + if (!node.visibility.empty() && (node.visibility & visibilityMask).empty()) continue; if (node.concreteType == ResourceNodeType::DescriptorTableVaPtr) { @@ -1053,12 +1012,12 @@ PipelineState::findResourceNode(ResourceNodeType nodeType, uint64_t descSet, uns // Check inner nodes. 
for (const ResourceNode &innerNode : node.innerTable) { - if (innerNode.visibility != 0 && (innerNode.visibility & visibilityMask) == 0) + if (!innerNode.visibility.empty() && (innerNode.visibility & visibilityMask).empty()) continue; - if (matchResourceNode(innerNode, nodeType, descSet, binding)) + if (matchResourceNode(innerNode, descSet, binding)) return {&node, &innerNode}; } - } else if (matchResourceNode(node, nodeType, descSet, binding)) + } else if (matchResourceNode(node, descSet, binding)) return {&node, &node}; } @@ -1081,13 +1040,18 @@ PipelineState::findResourceNode(ResourceNodeType nodeType, uint64_t descSet, uns // // @param nodeType : Type of the resource mapping node // @param stage : Shader stage to check against nodes' visibility field, or ShaderStage::Invalid for any -const ResourceNode *PipelineState::findSingleRootResourceNode(ResourceNodeType nodeType, ShaderStageEnum stage) const { - unsigned visibilityMask = UINT_MAX; - if (stage != ShaderStage::Invalid) - visibilityMask = 1 << std::min(unsigned(stage), unsigned(ShaderStage::Compute)); +const ResourceNode *PipelineState::findSingleRootResourceNode(ResourceNodeType nodeType, + std::optional stage) const { + ShaderStageMask visibilityMask(ShaderStages); + if (stage) { + ShaderStageEnum maskStage = stage.value(); + if (!ShaderStageMask(ShaderStagesNative).contains(maskStage)) + maskStage = ShaderStage::Compute; + visibilityMask = ShaderStageMask(maskStage); + } for (const ResourceNode &node : getUserDataNodes()) { - if (node.visibility != 0 && (node.visibility & visibilityMask) == 0) + if (!node.visibility.empty() && (node.visibility & visibilityMask).empty()) continue; if (node.concreteType == nodeType) return &node; @@ -1356,7 +1320,7 @@ unsigned PipelineState::getShaderWaveSize(ShaderStageEnum stage) { stage = ShaderStage::Geometry; } - assert(stage <= ShaderStage::Compute); + assert(ShaderStageMask(ShaderStagesNative).contains(stage)); if (!m_waveSize[stage]) 
setShaderDefaultWaveSize(stage); @@ -1655,10 +1619,9 @@ bool PipelineState::getShaderWgpMode(ShaderStageEnum stage) const { stage = ShaderStage::Geometry; } - assert(stage <= ShaderStage::Compute); - assert(stage < m_shaderOptions.size()); + assert(ShaderStageMask(ShaderStagesNative).contains(stage)); - return m_shaderOptions[stage].wgpMode; + return m_shaderOptions.lookup(stage).wgpMode; } // ===================================================================================================================== @@ -1704,7 +1667,7 @@ bool PipelineState::enableSwXfb() { lastVertexStage = lastVertexStage == ShaderStage::CopyShader ? ShaderStage::Geometry : lastVertexStage; if (!lastVertexStage) { - assert(isUnlinked()); // Unlinked fragment shader or part-pipeline + assert(!isWholePipeline()); // Unlinked fragment shader or part-pipeline return false; } @@ -1735,7 +1698,7 @@ ResourceUsage *PipelineState::getShaderResourceUsage(ShaderStageEnum shaderStage if (shaderStage == ShaderStage::CopyShader) shaderStage = ShaderStage::Geometry; - auto &resUsage = MutableArrayRef>(m_resourceUsage)[shaderStage]; + auto &resUsage = m_resourceUsage[shaderStage]; if (!resUsage) { resUsage = std::make_unique(shaderStage); } @@ -1750,7 +1713,7 @@ InterfaceData *PipelineState::getShaderInterfaceData(ShaderStageEnum shaderStage if (shaderStage == ShaderStage::CopyShader) shaderStage = ShaderStage::Geometry; - auto &intfData = MutableArrayRef>(m_interfaceData)[shaderStage]; + auto &intfData = m_interfaceData[shaderStage]; if (!intfData) { intfData = std::make_unique(); } diff --git a/lgc/state/RayTracingLibrarySummary.cpp b/lgc/state/RayTracingLibrarySummary.cpp index 9ac4fa0eab..2a6c45aa9a 100644 --- a/lgc/state/RayTracingLibrarySummary.cpp +++ b/lgc/state/RayTracingLibrarySummary.cpp @@ -38,7 +38,7 @@ using namespace lgc; namespace { namespace RtLibSummary { -constexpr unsigned MajorVersion = 1; +constexpr unsigned MajorVersion = 2; static constexpr char Version[] = "version"; static 
constexpr char UsesTraceRay[] = "uses_trace_ray"; @@ -46,9 +46,9 @@ static constexpr char KnownSetRayFlags[] = "ray_flags_known_set"; static constexpr char KnownUnsetRayFlags[] = "ray_flags_known_unset"; static constexpr char MaxRayPayloadSize[] = "max_ray_payload_size"; static constexpr char MaxHitAttributeSize[] = "max_hit_attribute_size"; -static constexpr char MaxUsedPayloadRegisterCount[] = "max_used_payload_register_count"; static constexpr char HasKernelEntry[] = "has_kernel_entry"; static constexpr char HasTraceRayModule[] = "has_trace_ray_module"; +static constexpr char LlvmRaytracingState[] = "llvm_raytracing_state"; } // namespace RtLibSummary } // anonymous namespace @@ -81,9 +81,12 @@ Expected RayTracingLibrarySummary::decodeMsgpack(Strin getUInt(root[RtLibSummary::KnownUnsetRayFlags], rls.knownUnsetRayFlags); getUInt(root[RtLibSummary::MaxRayPayloadSize], rls.maxRayPayloadSize); getUInt(root[RtLibSummary::MaxHitAttributeSize], rls.maxHitAttributeSize); - getUInt(root[RtLibSummary::MaxUsedPayloadRegisterCount], rls.maxUsedPayloadRegisterCount); getBool(root[RtLibSummary::HasKernelEntry], rls.hasKernelEntry); getBool(root[RtLibSummary::HasTraceRayModule], rls.hasTraceRayModule); + auto errorOrState = llvmraytracing::PipelineState::decodeMsgpack(root[RtLibSummary::LlvmRaytracingState]); + if (auto error = errorOrState.takeError()) + return error; + rls.llvmRaytracingState = *errorOrState; return rls; } @@ -100,9 +103,9 @@ std::string RayTracingLibrarySummary::encodeMsgpack() const { root[RtLibSummary::KnownUnsetRayFlags] = knownUnsetRayFlags; root[RtLibSummary::MaxRayPayloadSize] = maxRayPayloadSize; root[RtLibSummary::MaxHitAttributeSize] = maxHitAttributeSize; - root[RtLibSummary::MaxUsedPayloadRegisterCount] = maxUsedPayloadRegisterCount; root[RtLibSummary::HasKernelEntry] = hasKernelEntry; root[RtLibSummary::HasTraceRayModule] = hasTraceRayModule; + llvmRaytracingState.encodeMsgpack(root[RtLibSummary::LlvmRaytracingState]); std::string out; 
doc.writeToBlob(out); @@ -117,9 +120,9 @@ void RayTracingLibrarySummary::merge(const RayTracingLibrarySummary &other) { } maxRayPayloadSize = std::max(maxRayPayloadSize, other.maxRayPayloadSize); maxHitAttributeSize = std::max(maxHitAttributeSize, other.maxHitAttributeSize); - maxUsedPayloadRegisterCount = std::max(maxUsedPayloadRegisterCount, other.maxUsedPayloadRegisterCount); // TODO: Inherit kernel entry and trace ray module if possible and avoid recompile? hasKernelEntry = false; hasTraceRayModule = false; + llvmRaytracingState.merge(other.llvmRaytracingState); } diff --git a/lgc/state/ShaderModes.cpp b/lgc/state/ShaderModes.cpp index aec7d7699c..fa8d4cffba 100644 --- a/lgc/state/ShaderModes.cpp +++ b/lgc/state/ShaderModes.cpp @@ -51,7 +51,7 @@ static const char ComputeShaderModeMetadataName[] = "llpc.compute.mode"; // ===================================================================================================================== // Clear shader modes void ShaderModes::clear() { - memset(m_commonShaderModes, 0, sizeof(m_commonShaderModes)); + m_commonShaderModes.clear(); } // ===================================================================================================================== @@ -82,14 +82,16 @@ CommonShaderMode ShaderModes::getCommonShaderMode(Module &module, ShaderStageEnu // // @param stage : Shader stage const CommonShaderMode &ShaderModes::getCommonShaderMode(ShaderStageEnum stage) const { - return ArrayRef(m_commonShaderModes)[stage]; + auto mode = m_commonShaderModes.find(stage); + assert(mode != m_commonShaderModes.end()); + return mode->second; } // ===================================================================================================================== // Check if any shader stage has useSubgroupSize set bool ShaderModes::getAnyUseSubgroupSize() const { for (const auto &commonShaderMode : m_commonShaderModes) { - if (commonShaderMode.useSubgroupSize) + if (commonShaderMode.second.useSubgroupSize) return true; } return 
false; @@ -222,8 +224,8 @@ void ShaderModes::setSubgroupSizeUsage(Module &module, ShaderStageEnum stage, bo // @param module : LLVM module void ShaderModes::readModesFromPipeline(Module *module) { // First the common state. - for (unsigned stage = 0; stage < ArrayRef(m_commonShaderModes).size(); ++stage) - m_commonShaderModes[stage] = getCommonShaderMode(*module, ShaderStageEnum(stage)); + for (auto stage : ShaderStagesNative) + m_commonShaderModes[stage] = getCommonShaderMode(*module, stage); // Then the specific shader modes except tessellation. PipelineState::readNamedMetadataArrayOfInt32(module, GeometryShaderModeMetadataName, m_geometryShaderMode); diff --git a/lgc/state/TargetInfo.cpp b/lgc/state/TargetInfo.cpp index 73f0a1a59e..0b0aae94ce 100644 --- a/lgc/state/TargetInfo.cpp +++ b/lgc/state/TargetInfo.cpp @@ -35,10 +35,38 @@ using namespace lgc; using namespace llvm; +namespace llvm { +namespace cl { +// Define a category for Helper options. +OptionCategory AmdCategory{"Helper Options"}; +} // namespace cl +} // namespace llvm + // -native-wave-size: an option to override hardware native wave size, it will allow compiler to choose // final wave size base on it. Used in pre-silicon verification. 
static cl::opt NativeWaveSize("native-wave-size", cl::desc("Overrides hardware native wave size"), cl::init(0)); +namespace { + +class TargetInfoPrinter { +public: + void print(); + + void operator=(bool value) { + if (!value) + return; + print(); + exit(0); + } +}; + +TargetInfoPrinter TargetInfoPrinterInstance; + +cl::opt> TargetPrinter{ + "targetInfo", cl::desc("Display the supported device infos."), cl::location(TargetInfoPrinterInstance), + cl::cat(cl::AmdCategory)}; +} // namespace + // ===================================================================================================================== // Functions to set up TargetInfo for the various targets @@ -349,46 +377,49 @@ static void setGfx115FInfo(TargetInfo *targetInfo) { } #endif -// ===================================================================================================================== -// Set TargetInfo. Returns false if the GPU name is not found or not supported. -// -// @param gpuName : LLVM GPU name, e.g. "gfx900" -bool TargetInfo::setTargetInfo(StringRef gpuName) { - struct GpuNameStringMap { - const char *gpuName; - void (*setTargetInfoFunc)(TargetInfo *targetInfo); - }; +// Represents device infos. 
+struct GpuNameStringMap { + const char *gpuName; + const char *deviceName; + void (*setTargetInfoFunc)(TargetInfo *targetInfo); +}; - static const GpuNameStringMap GpuNameMap[] = { - {"gfx1010", &setGfx1010Info}, // gfx1010 +// The supported device list +static const GpuNameStringMap GpuNameMap[] = { + {"gfx1010", "Navi10", &setGfx1010Info}, // gfx1010 #if LLPC_BUILD_NAVI12 - {"gfx1011", &setGfx1011Info}, // gfx1011, navi12 + {"gfx1011", "Navi12", &setGfx1011Info}, // gfx1011 #endif - {"gfx1012", &setGfx1012Info}, // gfx1012, navi14 - {"gfx1030", &setGfx1030Info}, // gfx1030, navi21 - {"gfx1031", &setGfx1031Info}, // gfx1031, navi22 - {"gfx1032", &setGfx1032Info}, // gfx1032, navi23 - {"gfx1034", &setGfx1034Info}, // gfx1034, navi24 + {"gfx1012", "Navi14", &setGfx1012Info}, // gfx1012 + {"gfx1030", "Navi21", &setGfx1030Info}, // gfx1030 + {"gfx1031", "Navi22", &setGfx1031Info}, // gfx1031 + {"gfx1032", "Navi23", &setGfx1032Info}, // gfx1032 + {"gfx1034", "Navi24", &setGfx1034Info}, // gfx1034 #if LLPC_BUILD_REMBRANDT - {"gfx1035", &setGfx1035Info}, // gfx1035, rembrandt + {"gfx1035", "Rembrandt", &setGfx1035Info}, // gfx1035 #endif #if LLPC_BUILD_RAPHAEL || LLPC_BUILD_MENDOCINO - {"gfx1036", &setGfx1036Info}, // gfx1036, raphael | mendocino + {"gfx1036", "Raphael", &setGfx1036Info}, // gfx1036 #endif - {"gfx1100", &setGfx1100Info}, // gfx1100, navi31 + {"gfx1100", "Navi31", &setGfx1100Info}, // gfx1100 #if LLPC_BUILD_NAVI32 - {"gfx1101", &setGfx1101Info}, // gfx1101, navi32 + {"gfx1101", "Navi32", &setGfx1101Info}, // gfx1101 #endif - {"gfx1102", &setGfx1102Info}, // gfx1102, navi33 + {"gfx1102", "Navi33", &setGfx1102Info}, // gfx1102 #if LLPC_BUILD_PHOENIX1 || LLPC_BUILD_PHOENIX2 - {"gfx1103", &setGfx1103Info}, // gfx1103, phoenix1 + {"gfx1103", "Phoenix1", &setGfx1103Info}, // gfx1103 #endif #if LLPC_BUILD_STRIX1 - {"gfx1150", &setGfx1150Info}, // gfx1150, strix - {"gfx115F", &setGfx115FInfo}, // gfx115F, strix A0 + {"gfx1150", "Strix1", &setGfx1150Info}, // 
gfx1150 + {"gfx115F", "Strix1 A0", &setGfx115FInfo}, // gfx115F #endif - }; +}; +// ===================================================================================================================== +// Set TargetInfo. Returns false if the GPU name is not found or not supported. +// +// @param gpuName : LLVM GPU name, e.g. "gfx900" +bool TargetInfo::setTargetInfo(StringRef gpuName) { void (*setTargetInfoFunc)(TargetInfo * targetInfo) = nullptr; for (const GpuNameStringMap &mapEntry : ArrayRef(GpuNameMap)) { if (gpuName == mapEntry.gpuName) { @@ -413,3 +444,13 @@ bool TargetInfo::setTargetInfo(StringRef gpuName) { return true; } + +// ===================================================================================================================== +// Print the target infos +void TargetInfoPrinter::print() { + unsigned count = sizeof(GpuNameMap) / sizeof(GpuNameMap[0]); + for (unsigned i = 0; i < count; ++i) { + // Remove substring "gfx" + outs() << StringRef(GpuNameMap[i].gpuName).drop_front(3) << " " << GpuNameMap[i].deviceName << '\n'; + } +} diff --git a/lgc/test/CallLibFromCs-indirect.lgc b/lgc/test/CallLibFromCs-indirect.lgc index 4c5f4a82a8..9f0d275e30 100644 --- a/lgc/test/CallLibFromCs-indirect.lgc +++ b/lgc/test/CallLibFromCs-indirect.lgc @@ -1,6 +1,6 @@ ; Call an extern compute library function from a compute shader. 
-; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-mutate-entry-point -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation ; CHECK: define dllexport amdgpu_cs void @lgc.shader.CS.main(i32 inreg noundef %globalTable, ptr addrspace(4) inreg noundef %numWorkgroupsPtr, i32 inreg noundef %userdata0, i32 inreg noundef %userdata1, i32 inreg noundef %userdata2, i32 inreg noundef %userdata3, i32 inreg noundef %userdata4, i32 inreg noundef %userdata5, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %userdata9, i32 inreg noundef %userdata10, i32 inreg noundef %userdata11, i32 inreg noundef %spillTable, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !lgc.shaderstage !7 { ; CHECK: call amdgpu_gfx i32 %func_ptr(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %userdata0, i32 inreg %userdata1, i32 inreg %userdata2, i32 inreg %userdata3, i32 inreg %userdata4, i32 inreg %userdata5, i32 inreg %userdata6, i32 inreg %userdata7, i32 inreg %userdata8, i32 inreg %userdata9, i32 inreg %userdata10, i32 inreg %userdata11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) diff --git a/lgc/test/CallLibFromCs.lgc b/lgc/test/CallLibFromCs.lgc index 3f68d40e8f..57f61b9b4f 100644 --- a/lgc/test/CallLibFromCs.lgc +++ b/lgc/test/CallLibFromCs.lgc @@ -1,6 +1,6 @@ ; Call an extern compute library function from a compute shader. 
-; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-mutate-entry-point -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation ; CHECK: declare amdgpu_gfx i32 @compute_library_func() #0 ; CHECK: define dllexport amdgpu_cs void @lgc.shader.CS.main(i32 inreg noundef %globalTable, ptr addrspace(4) inreg noundef %numWorkgroupsPtr, i32 inreg noundef %userdata0, i32 inreg noundef %userdata1, i32 inreg noundef %userdata2, i32 inreg noundef %userdata3, i32 inreg noundef %userdata4, i32 inreg noundef %userdata5, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %userdata9, i32 inreg noundef %userdata10, i32 inreg noundef %userdata11, i32 inreg noundef %spillTable, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #1 !lgc.shaderstage !7 { diff --git a/lgc/test/CsComputeLibrary.lgc b/lgc/test/CsComputeLibrary.lgc index 1368e78100..795fff8f49 100644 --- a/lgc/test/CsComputeLibrary.lgc +++ b/lgc/test/CsComputeLibrary.lgc @@ -1,6 +1,6 @@ ; Define a compute library that can be called from a compute shader. 
-; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -print-after=lgc-patch-prepare-pipeline-abi -print-after=lgc-patch-setup-target-features -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-mutate-entry-point -print-after=lgc-patch-prepare-pipeline-abi -print-after=lgc-patch-setup-target-features -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation ; CHECK: define amdgpu_gfx void @func(i32 inreg noundef %globalTable, ptr addrspace(4) inreg noundef %numWorkgroupsPtr, i32 inreg noundef %userdata0, i32 inreg noundef %userdata1, i32 inreg noundef %userdata2, i32 inreg noundef %userdata3, i32 inreg noundef %userdata4, i32 inreg noundef %userdata5, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %userdata9, i32 inreg noundef %userdata10, i32 inreg noundef %userdata11, i32 inreg noundef %spillTable, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !lgc.shaderstage !7 { ; CHECK: !7 = !{i32 7} diff --git a/lgc/test/TaskShaderOps.lgc b/lgc/test/TaskShaderOps.lgc index 88a85fee82..65b1862ae0 100644 --- a/lgc/test/TaskShaderOps.lgc +++ b/lgc/test/TaskShaderOps.lgc @@ -16,7 +16,7 @@ ; CHECK-NEXT: [[dimX:%[0-9]*]] = extractelement <3 x i32> %meshTaskDispatchDims, i64 0 ; CHECK-NEXT: [[tempResult2:%[0-9]*]] = mul i32 [[tempResult1]], [[dimX]] ; CHECK-NEXT: [[flattenId:%[0-9]*]] = add i32 [[tempResult2]], [[groupIdX]] -; CHECK-NEXT: [[entryIndex:%[0-9]*]] = add i32 [[flattenId]], %meshTaskRingIndex +; CHECK-NEXT: [[entryIndex:%[0-9]*]] = add i32 {{(%meshTaskRingIndex, )?}}[[flattenId]]{{(, %meshTaskRingIndex)?}}{{$}} ; CHECK: [[drawDataRingDescPtr:%[0-9]*]] = getelementptr {{i8|<4 x i32>}}, ptr addrspace(4) %{{[0-9]*}}, i64 {{224|14}} ; CHECK-NEXT: [[drawDataRingDesc:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) 
[[drawDataRingDescPtr]], align 16 ; CHECK: [[payloadRingDescPtr:%[0-9]*]] = getelementptr {{i8|<4 x i32>}}, ptr addrspace(4) %{{[0-9]*}}, i64 {{208|13}} diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc index 7ed782e130..17289c70a5 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc @@ -5,13 +5,13 @@ define void @matmul_f16(ptr %ptr) { ; CHECK-LABEL: define void @matmul_f16 ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ACCUM_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[ACCUM_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0, i32 0) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x float> [ [[ACCUM_LOAD]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; CHECK-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; CHECK: end: @@ -19,7 +19,7 @@ define void @matmul_f16(ptr %ptr) { ; CHECK-NEXT: ret void ; entry: - %accum.load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %accum.load = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0, i32 0) br label %loop loop: @@ -29,7 +29,7 @@ loop: %b = call <8 x float> @getmat1() %accum.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum.phi, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %accum.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() @@ -49,7 +49,7 @@ define void @matmul_f16_initzero(ptr %ptr) { ; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; CHECK-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; CHECK: end: @@ -66,7 +66,7 @@ loop: %b = call <8 x float> @getmat1() %accum.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum.phi, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %accum.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc index 9bff238d2c..29fa46b2bb 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc @@ -6,8 +6,8 @@ define void @matmul_f16_pack_simple(ptr %out0, ptr %out1, <8 x float> %a, <8 x f ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) @@ -15,8 +15,8 @@ define void @matmul_f16_pack_simple(ptr %out0, ptr %out1, <8 x float> %a, <8 x f ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) ret void @@ -27,10 +27,10 @@ define void @matmul_f16_pack_chain_sequential(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 false) ; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 true) @@ -38,10 +38,10 @@ define void @matmul_f16_pack_chain_sequential(ptr %out0, ptr %out1, <8 x float> ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void @@ -52,10 +52,10 @@ define void @matmul_f16_pack_chain_alternating(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 true) @@ -63,10 +63,10 @@ define void @matmul_f16_pack_chain_alternating(ptr %out0, ptr %out1, <8 x float> ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void @@ -77,10 +77,10 @@ define void @matmul_f16_pack_chain_nested(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_2]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_2]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 true) @@ -88,10 +88,10 @@ define void @matmul_f16_pack_chain_nested(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void @@ -101,14 +101,14 @@ define void @matmul_f16_no_packable_chain(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-LABEL: define void @matmul_f16_no_packable_chain ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_1]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.1) ret void } @@ -118,16 +118,16 @@ define void @matmul_f16_chain_loop(ptr %out0, ptr %out1, <8 x float> %a, <8 x fl ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) ; GFX11-NEXT: br label [[LOOP:%.*]] ; GFX11: loop: ; GFX11-NEXT: [[ACCUM1_PHI:%.*]] = phi <8 x float> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[CHAIN1_2:%.*]], [[LOOP]] ] -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: end: @@ -138,16 +138,16 @@ define void @matmul_f16_chain_loop(ptr %out0, ptr %out1, <8 x float> %a, <8 x fl ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %loop loop: %accum0.phi = phi <8 x float> [ %chain0.1, %entry ], [ %chain0.2, %loop ] %accum1.phi = phi <8 x float> [ %chain1.1, %entry ], [ %chain1.2, %loop ] - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end @@ -171,8 +171,8 @@ define void @matmul_f16_chain_loop_phis(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: loop: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 false, i1 false, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 false, i1 false, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 false, i1 false, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 false, i1 false, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[HEADER]] ; GFX11: end: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[ACCUM1_PHI]], i1 false) @@ -195,8 +195,8 @@ header: loop: %accum0.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum0.phi, i32 1, i32 1, i32 0, i32 1) %accum1.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum1.phi, i32 1, i32 1, i32 0, i32 1) - %muladdLo = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1, i32 1) %accum0.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo, i32 1, i32 1, i32 1, i32 0) %accum1.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi, i32 1, i32 1, i32 1, i32 0) @@ -213,23 +213,23 @@ define void @matmul_f16_chain_branch(ptr %out0, ptr %out1, <8 x float> %a, <8 x ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] ; GFX11: if_true: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: if_false: ; GFX11-NEXT: [[A_FALSE:%.*]] = call <8 x float> @getmat1() ; GFX11-NEXT: [[B_FALSE:%.*]] = call <8 x float> @getmat1() ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) -; GFX11-NEXT: [[CHAIN0_3:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN1_3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP4]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP4]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[END]] ; GFX11: end: ; GFX11-NEXT: [[ACCUM0_PHI:%.*]] = phi <8 x float> [ [[CHAIN0_2]], [[IF_TRUE]] ], [ [[CHAIN0_3]], [[IF_FALSE]] ] @@ -239,22 +239,22 @@ define void @matmul_f16_chain_branch(ptr %out0, ptr %out1, <8 x float> %a, <8 x ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %cc = call i1 @getcc() br i1 %cc, label %if_true, label %if_false if_true: - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %end if_false: %a.false = call <8 x float> @getmat1() %b.false = call <8 x float> @getmat1() - %chain0.3 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.3 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.3 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.3 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %end @@ -272,15 +272,15 @@ define void @matmul_f16_chain_diff_bbs(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br label [[CONT:%.*]] ; GFX11: cont: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: end: ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN0_2]]) @@ -288,13 +288,13 @@ define void @matmul_f16_chain_diff_bbs(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %cc = call i1 @getcc() br label %cont cont: - %chain0.2 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %end end: @@ -316,8 +316,8 @@ define void @matmul_f16_pack_loop(ptr %out0, ptr %out1) { ; GFX11-NEXT: [[ACCUM1_PHI:%.*]] = phi <8 x float> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[MULADDHI:%.*]], [[LOOP]] ] ; GFX11-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; GFX11-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: end: @@ -341,8 +341,8 @@ loop: %accum0.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum0.phi, i32 1, i32 1, i32 0, i32 1) %accum1.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum1.phi, i32 1, i32 1, i32 0, i32 1) - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %accum0.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo, i32 1, i32 1, i32 1, i32 0) %accum1.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi, i32 1, i32 1, i32 1, i32 0) @@ -360,8 +360,8 @@ define void @matmul_f16_pack_scalar_same(ptr %out0, ptr %out1, <8 x float> %a, < ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[MULADDHI]], <2 x half> , i32 6, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP1]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) @@ -370,8 +370,8 @@ define void @matmul_f16_pack_scalar_same(ptr %out0, ptr %out1, <8 x float> %a, < ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH310F, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) @@ -384,8 +384,8 @@ define void @matmul_f16_pack_scalar_different(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[MULADDHI]], <2 x half> , i32 6, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP1]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) @@ -394,8 +394,8 @@ define void @matmul_f16_pack_scalar_different(ptr %out0, ptr %out1, <8 x float> ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH3100, i32 1, i32 1) call void (...) 
@lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) @@ -408,8 +408,8 @@ define void @matmul_f16_pack_scalar_only_lo(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) ; GFX11-NEXT: [[SCALEDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP1]], half 0xH310F, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDLO]]) @@ -418,8 +418,8 @@ define void @matmul_f16_pack_scalar_only_lo(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) @@ -431,8 +431,8 @@ define void @matmul_f16_pack_scalar_only_hi(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) ; GFX11-NEXT: [[SCALEDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP1]], half 0xH3100, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) @@ -441,8 +441,8 @@ define void @matmul_f16_pack_scalar_only_hi(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH3100, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledHi) @@ -454,8 +454,8 @@ define void @matmul_f16_pack_scalar_diff_bbs(ptr %out0, ptr %out1, <8 x float> % ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE_LO:%.*]] ; GFX11: scale_lo: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) @@ -471,8 +471,8 @@ define void @matmul_f16_pack_scalar_diff_bbs(ptr %out0, ptr %out1, <8 x float> % ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %scale_lo scale_lo: @@ -494,8 +494,8 @@ define void @matmul_f16_pack_user_between_scalar(ptr %out0, ptr %out1, <8 x floa ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE:%.*]] ; GFX11: scale: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) @@ -510,8 +510,8 @@ define void @matmul_f16_pack_user_between_scalar(ptr %out0, ptr %out1, <8 x floa ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %scale scale: @@ -531,8 +531,8 @@ define void @matmul_f16_pack_factor_between_scalar(ptr %in, ptr %out0, ptr %out1 ; GFX11-SAME: (ptr [[IN:%.*]], ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE:%.*]] ; GFX11: scale: ; GFX11-NEXT: [[FACTORHI:%.*]] = load half, ptr [[IN]], align 2 @@ -547,8 +547,8 @@ define void @matmul_f16_pack_factor_between_scalar(ptr %in, ptr %out0, ptr %out1 ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %scale scale: @@ -568,11 +568,11 @@ define void @matmul_f16_pack_binop_fadd(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) ; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) @@ -584,10 +584,10 @@ define void @matmul_f16_pack_binop_fadd(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 %binOpHi = call <8 x float> (...) 
@lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdHi0, <8 x float> %muladdHi1, i32 1, i32 1) #3 call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) @@ -600,11 +600,11 @@ define void @matmul_f16_pack_binop_incompatible_matrices(ptr %out0, ptr %out1, < ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) ; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) @@ -616,10 +616,10 @@ define void @matmul_f16_pack_binop_incompatible_matrices(ptr %out0, ptr %out1, < ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo0 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 %binOpHi = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdHi0, <8 x float> %muladdHi0, i32 1, i32 1) #3 call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) @@ -632,11 +632,11 @@ define void @matmul_f16_pack_binop_incompatible_arithop(ptr %out0, ptr %out1, <8 ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) ; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) @@ -648,10 +648,10 @@ define void @matmul_f16_pack_binop_incompatible_arithop(ptr %out0, ptr %out1, <8 ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 %binOpHi = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 3, <8 x float> %muladdHi0, <8 x float> %muladdHi1, i32 1, i32 1) #3 call void (...) 
@lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) @@ -664,15 +664,15 @@ define void @matmul_f16_unpack_before_convert(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) ; GFX11-NEXT: [[CONVERTLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[TMP1]], i32 1, i32 1, i32 1, i32 0) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 true) ; GFX11-NEXT: [[CONVERTHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[TMP2]], i32 1, i32 1, i32 1, i32 0) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[CONVERTLO]], <8 x float> [[B]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[CONVERTHI]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[CONVERTLO]], <8 x float> [[B]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[CONVERTHI]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP4]]) ; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 true) @@ -680,12 +680,12 @@ define void @matmul_f16_unpack_before_convert(ptr %out0, ptr %out1, <8 x float> ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo0 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %convertLo = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo0, i32 1, i32 1, i32 1, i32 0) %convertHi = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi0, i32 1, i32 1, i32 1, i32 0) - %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %convertLo, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %convertHi, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %convertLo, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %convertHi, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo1) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi1) ret void @@ -695,15 +695,15 @@ define void @matmul_f32_no_pack(ptr %out0, ptr %out1, <8 x float> %a, <8 x float ; GFX11-LABEL: define void @matmul_f32_no_pack ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C1]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2, i32 2) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C1]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2, i32 2) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> [[MULADDLO]]) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> [[MULADDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2, i32 2) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2, i32 2) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> %muladdLo) call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> %muladdHi) ret void @@ -713,17 +713,17 @@ define void @matmul_f16_modified_accumulator(ptr %out0, ptr %out1, <8 x float> % ; GFX11-LABEL: define void @matmul_f16_modified_accumulator ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[ACCUM_C2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[MULADDLO]], <8 x float> [[C1]], i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_C2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_C2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[MULADDLO]]) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[MULADDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %accum.c2 = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo, <8 x float> %c1, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) ret void @@ -734,22 +734,22 @@ define void @matmul_f16_store_between_muladds(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN0_2]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } @@ -759,22 +759,22 @@ define void @matmul_f16_store_within_chain(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 true) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc index 727138420e..9af5b0dacd 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc @@ -10,7 +10,7 @@ define <8 x float> @insert_transpose(<8 x float> %x) { ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[X]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: @@ -28,7 +28,7 @@ loop: %f = call <8 x float> @getmat1() %pre.t = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %v.loop, i32 1, i32 0) - %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre.t, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre.t, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %v.next = call <8 x float> (...) 
@lgc.cooperative.matrix.transpose__v8f32(<8 x float> %muladd, i32 1, i32 0) %cc = call i1 @getcc() @@ -43,12 +43,12 @@ define <8 x float> @reuse_transpose(<8 x float> %x) { ; CHECK-LABEL: define <8 x float> @reuse_transpose ; CHECK-SAME: (<8 x float> [[X:%.*]]) { ; CHECK-NEXT: [[T1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> [[X]], i32 1, i32 0) -; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[T1]], <8 x float> [[X]], <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[T1]], <8 x float> [[X]], <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: ret <8 x float> [[R]] ; %t1 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %x, i32 1, i32 0) %t2 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %t1, i32 1, i32 0) - %r = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %t1, <8 x float> %t2, <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %r = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %t1, <8 x float> %t2, <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ret <8 x float> %r } @@ -63,7 +63,7 @@ define <8 x float> @insert_convert(ptr %ptr) { ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[LOAD]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: @@ -81,7 +81,7 @@ loop: %f = call <8 x float> @getmat1() %pre = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %v.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() @@ -96,12 +96,12 @@ define <8 x float> @reuse_convert(<8 x float> %x) { ; CHECK-LABEL: define <8 x float> @reuse_convert ; CHECK-SAME: (<8 x float> [[X:%.*]]) { ; CHECK-NEXT: [[CVT1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[X]], i32 1, i32 1, i32 0, i32 1) -; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[X]], <8 x float> [[X]], <8 x float> [[CVT1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[X]], <8 x float> [[X]], <8 x float> [[CVT1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: ret <8 x float> [[R]] ; %cvt1 = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %x, i32 1, i32 1, i32 0, i32 1) %cvt2 = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %cvt1, i32 1, i32 1, i32 1, i32 0) - %r = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %cvt2, <8 x float> %cvt2, <8 x float> %cvt1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %r = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %cvt2, <8 x float> %cvt2, <8 x float> %cvt1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ret <8 x float> %r } @@ -193,7 +193,7 @@ define void @convert_to_acc_inner_chain(ptr %ptr) { ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[LOAD]], [[ENTRY:%.*]] ], [ [[SCALAR:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[MULADD]], <8 x float> [[MULADD]], i32 1, i32 1) ; CHECK-NEXT: [[SCALAR]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[BINOP]], half 0xH310F, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() @@ -214,7 +214,7 @@ loop: %f = call <8 x float> @getmat1() %pre = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %binop = call <8 x float> (...) 
@lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladd, <8 x float> %muladd, i32 1, i32 1) %scalar = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %binop, half 0xH310F, i32 1, i32 1) %v.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 0) diff --git a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc index cb41cd1279..bc1b2c82e5 100644 --- a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc +++ b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(i32 %target, i32 %levels, {i32} %state, ...) noreturn diff --git a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc index 8aecb2319d..bc0b8750ba 100644 --- a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(i32, i32, { i32 }, ...) 
#0 diff --git a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc index dc9f6d1f1e..bfaeb3c10d 100644 --- a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s %_rgen_1.Frame = type { ptr addrspace(7), ptr addrspace(7), i32 } diff --git a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc index 793f4bbdad..8486eac1cb 100644 --- a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(...) 
noreturn declare ptr addrspace(32) @lgc.cps.alloc(i32) diff --git a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc index a2c79432d8..d057bd62b7 100644 --- a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(...) noreturn diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc index 97b624e86e..99b85f4344 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc @@ -11,7 +11,7 @@ define <8 x i32> @muladd_bf16_bf16(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) { ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[VALUE1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7, i32 7) ret <8 x i32> %value } @@ -23,7 +23,7 @@ define <8 x float> @muladd_bf16_f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c) ; CHECK-NEXT: [[VALUE1:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <8 x float> [[C]]) ; CHECK-NEXT: ret <8 x float> [[VALUE1]] ; - %value = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c, i1 false, i1 false, i1 false, i1 false, i32 2, i32 7) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7, i32 2) ret <8 x float> %value } diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc index 90ffa98e22..3738960bb8 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc @@ -1,22 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5 ; RUN: lgc -march=amdgcn -o - --mcpu=gfx1010 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s define void @matmul_f16f32_emulator(ptr addrspace(3) %out0, <8 x float> %a, <8 x float> %b, <8 x float> %c0) !lgc.shaderstage !0 { -; CHECK-NOT: v_dot - %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 1) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 2) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value) ret void } define void @matmul_i16i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { -; CHECK-NOT: v_dot - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 4) + %value = call <8 x i32> (...) 
@lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 4, i32 4, i32 5) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) ret void } define void @matmul_i8i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { -; CHECK-NOT: v_dot - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 3) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 3, i32 3, i32 5) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) ret void } @@ -30,3 +28,5 @@ declare void @lgc.cooperative.matrix.store(...) ; Setting Threadgroup Dimensions to 64 x 1 x 1 !llpc.compute.mode = !{!1} !1 = !{i32 64, i32 1, i32 1} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc index 88292bf642..c1aca85e3d 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc @@ -1,22 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5 ; RUN: lgc -march=amdgcn -o - --mcpu=gfx1011 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s define void @matmul_f16f32_emulator(ptr addrspace(3) %out0, <8 x float> %a, <8 x float> %b, <8 x float> %c0) !lgc.shaderstage !0 { -; CHECK: v_dot2c_f32_f16 - %value = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 1) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 2) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value) ret void } define void @matmul_i16i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { -; CHECK: v_dot2_i32_i16 - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 4) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 4, i32 4, i32 5) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) ret void } define void @matmul_i8i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { -; CHECK: v_dot4c_i32_i8 - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 3) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 3, i32 3, i32 5) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) ret void } @@ -31,3 +29,5 @@ declare void @lgc.cooperative.matrix.store(...) !llpc.compute.mode = !{!1} !1 = !{i32 64, i32 1, i32 1} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc b/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc index 6fc5998cb8..623c0f6208 100644 --- a/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc +++ b/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 2 -; RUN: lgc --mcpu=gfx1100 -o - -passes="require,module(lgc-lower-desc),module(lgc-patch-entry-point-mutate),function(lgc-patch-buffer-op)" %s | FileCheck --check-prefixes=GFX11 %s +; RUN: lgc --mcpu=gfx1100 -o - -passes="require,module(lgc-lower-desc),module(lgc-mutate-entry-point),function(lgc-patch-buffer-op)" %s | FileCheck --check-prefixes=GFX11 %s define amdgpu_kernel void @strided_buffer_desc_to_ptr(<4 x i32> inreg %desc, ptr %out) { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr diff --git a/lgc/test/WorkgroupIdOpt.lgc b/lgc/test/WorkgroupIdOpt.lgc index 43d0088202..a33032b127 100644 --- a/lgc/test/WorkgroupIdOpt.lgc +++ b/lgc/test/WorkgroupIdOpt.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5 -; RUN: lgc -mcpu=gfx1100 -passes=lgc-patch-entry-point-mutate -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1100 -passes=lgc-mutate-entry-point -o - %s | FileCheck --check-prefixes=CHECK %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p32:32:32" target triple = "amdgcn--amdpal" diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc index 4b3903cef3..4ffc869bff 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc +++ 
b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc @@ -10,27 +10,35 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP23]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP23]], i32 -1) ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP27:%.*]] = load <8 x i32>, ptr addrspace(4) 
[[TMP25]], align 4, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP28]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) -; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP26]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP21:%.*]] = call <8 x i32> 
@llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP16]], <8 x i32> [[TMP20]]) ; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0) ; CHECK-NEXT: ret void diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc index 21c05a91d3..1a89e812f6 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc @@ -10,19 +10,27 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP17]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP17]], i32 -1) ] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP5]], align 16, 
!invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP19]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP19]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP11]], align 16, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> -; CHECK-NEXT: [[TMP14:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP20]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP8]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP11]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP19]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP8]], <8 x i32> [[TMP12]]) ; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP7]], i32 15, i32 1, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: ret void diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc index 748eb7bb27..176dca5ce4 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc @@ -10,20 +10,28 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; 
CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP17]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP17]], i32 -1) ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) 
[[TMP21]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP13]], <4 x i32> , i1 false, i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) ; CHECK-NEXT: ret void diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc index 29fe33a9e3..1d568c02d8 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc @@ -12,23 +12,31 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; 
CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP21]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP23]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP23]], i32 -1) ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP24]], align 16, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = call ptr addrspace(4) @foo1(i32 [[TMP0]]) -; CHECK-NEXT: [[TMP19:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP26:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP8]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP10]], <4 x i32> [[TMP9]]) ; CHECK-NEXT: 
[[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP11]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP22]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP11]], <4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP15]], <4 x i32> [[TMP16]], i1 false, i32 0, i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP11]], <4 x float> [[TMP17]]) diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc index 845b764733..81b2717e12 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc @@ -10,30 +10,38 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[DOTNOT]], label [[RET:%.*]], 
label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP23]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP23]], i32 -1) ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP27:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP25]], align 4, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], 
i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP28]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) -; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP26]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP21:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP16]], <8 x i32> [[TMP20]]) ; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0) ; CHECK-NEXT: br label [[RET]] diff --git a/lgc/test/tanh.lgc b/lgc/test/tanh.lgc index 977bc458fb..b21cdaf689 100644 --- a/lgc/test/tanh.lgc +++ b/lgc/test/tanh.lgc @@ -33,22 +33,12 @@ define float @sample(float %x) !lgc.shaderstage !1 { ret float %y } -; Function Attrs: nounwind willreturn memory(read) -declare !lgc.create.opcode !2 i32 @lgc.create.read.builtin.input.i32(...) 
#0 - -; Function Attrs: nounwind willreturn memory(none) -declare ptr addrspace(7) @lgc.load.buffer.desc(i64, i32, i32, i32) #1 - ; Function Attrs: nounwind memory(none) -declare !lgc.create.opcode !3 float @lgc.create.tanh.f32(...) #2 +declare float @lgc.create.tanh.f32(...) #2 -attributes #0 = { nounwind willreturn memory(read) } -attributes #1 = { nounwind willreturn memory(none) } -attributes #2 = { nounwind memory(none) } +attributes #0 = { nounwind memory(none) } !llpc.compute.mode = !{!0} !0 = !{i32 8, i32 8, i32 1} !1 = !{i32 7} -!2 = !{i32 77} -!3 = !{i32 17} diff --git a/lgc/util/GfxRegHandler.cpp b/lgc/util/GfxRegHandler.cpp index 3d764c2951..c91b353861 100644 --- a/lgc/util/GfxRegHandler.cpp +++ b/lgc/util/GfxRegHandler.cpp @@ -181,6 +181,9 @@ static constexpr BitsInfo SqImgRsrcRegBitsGfx10[static_cast(SqRsrcRegs {1, 30, 2}, // WidthLo {2, 0, 12}, // WidthHi {5, 0, 4}, // ArrayPitch + {1, 8, 12}, // MinLod + {}, // MinLodLo + {}, // MinLodHi }; // ===================================================================================================================== @@ -204,6 +207,9 @@ static constexpr BitsInfo SqImgRsrcRegBitsGfx11[static_cast(SqRsrcRegs {1, 30, 2}, // WidthLo {2, 0, 12}, // WidthHi {5, 0, 4}, // ArrayPitch + {}, // MinLod + {5, 27, 5}, // MinLodLo + {6, 0, 7}, // MinLodHi }; // ===================================================================================================================== @@ -247,6 +253,17 @@ Value *SqImgRsrcRegHandler::getReg(SqRsrcRegs regId) { case SqRsrcRegs::BaseArray: case SqRsrcRegs::ArrayPitch: return getRegCommon(static_cast(regId)); + case SqRsrcRegs::MinLod: + switch (m_gfxIpVersion->major) { + case 10: + return getRegCommon(static_cast(regId)); + case 11: + return getRegCombine(static_cast(SqRsrcRegs::MinLodLo), static_cast(SqRsrcRegs::MinLodHi)); + default: + llvm_unreachable("GFX IP is not supported!"); + break; + } + break; case SqRsrcRegs::Depth: case SqRsrcRegs::Height: case SqRsrcRegs::Pitch: @@ 
-281,10 +298,26 @@ void SqImgRsrcRegHandler::setReg(SqRsrcRegs regId, Value *regValue) { case SqRsrcRegs::DstSelXYZW: case SqRsrcRegs::SwizzleMode: case SqRsrcRegs::Type: - case SqRsrcRegs::Depth: case SqRsrcRegs::BcSwizzle: + case SqRsrcRegs::BaseLevel: + case SqRsrcRegs::LastLevel: + case SqRsrcRegs::BaseArray: setRegCommon(static_cast(regId), regValue); break; + case SqRsrcRegs::MinLod: + switch (m_gfxIpVersion->major) { + case 10: + setRegCommon(static_cast(regId), regValue); + break; + case 11: + setRegCombine(static_cast(SqRsrcRegs::MinLodLo), static_cast(SqRsrcRegs::MinLodHi), regValue); + break; + default: + llvm_unreachable("GFX IP is not supported!"); + break; + } + break; + case SqRsrcRegs::Depth: case SqRsrcRegs::Height: case SqRsrcRegs::Pitch: setRegCommon(static_cast(regId), m_builder->CreateSub(regValue, m_one)); diff --git a/lgc/util/ModuleBunch.cpp b/lgc/util/ModuleBunch.cpp index 7a2f747f4a..0ecf4be684 100644 --- a/lgc/util/ModuleBunch.cpp +++ b/lgc/util/ModuleBunch.cpp @@ -77,8 +77,12 @@ bool ModuleBunch::isNormalized() const { /// to Module::print for each module. 
void ModuleBunch::print(raw_ostream &OS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder, bool IsForDebug) const { - for (const Module &M : *this) - M.print(OS, AAW, ShouldPreserveUseListOrder, IsForDebug); + for (const std::unique_ptr &M : Modules) { + if (!M) + OS << "\n"; + else + M->print(OS, AAW, ShouldPreserveUseListOrder, IsForDebug); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llpc/CMakeLists.txt b/llpc/CMakeLists.txt index 748b948e98..825530be64 100644 --- a/llpc/CMakeLists.txt +++ b/llpc/CMakeLists.txt @@ -186,7 +186,7 @@ target_include_directories(llpcinternal include ../include context - lower + lowering translator/include translator/lib/SPIRV translator/lib/SPIRV/libSPIRV @@ -214,28 +214,29 @@ if(ICD_BUILD_LLPC) context/llpcRayTracingContext.cpp ) -# llpc/lower +# llpc/lowering target_sources(llpcinternal PRIVATE - lower/llpcSpirvLower.cpp - lower/LowerAccessChain.cpp - lower/LowerCfgMerges.cpp - lower/LowerConstImmediateStore.cpp - lower/LowerGlobals.cpp - lower/LowerInstMetaRemove.cpp - lower/LowerMath.cpp - lower/LowerMemoryOp.cpp - lower/LowerPostInline.cpp - lower/LowerRayTracing.cpp - lower/LowerTerminator.cpp - lower/LowerTranslator.cpp - lower/llpcSpirvLowerUtil.cpp - lower/ProcessGpuRtLibrary.cpp - lower/LowerInternalLibraryIntrinsic.cpp - lower/LowerGLCompatibility.cpp - lower/LowerCooperativeMatrix.cpp - lower/PrepareContinuations.cpp - lower/LowerAdvancedBlend.cpp - lower/ProcessGfxRuntimeLibrary.cpp + lowering/Lowering.cpp + lowering/LowerAccessChain.cpp + lowering/LowerCfgMerges.cpp + lowering/LowerConstImmediateStore.cpp + lowering/LowerGlobals.cpp + lowering/LowerInstMetaRemove.cpp + lowering/LowerMath.cpp + lowering/LowerMemoryOp.cpp + lowering/LowerPostInline.cpp + lowering/LowerRayTracing.cpp + lowering/LowerTerminator.cpp + lowering/LowerTranslator.cpp + lowering/LoweringUtil.cpp + lowering/ProcessGpuRtLibrary.cpp + lowering/LowerInternalLibraryIntrinsic.cpp + lowering/LowerGlCompatibility.cpp 
+ lowering/ScalarReplacementOfBuiltins.cpp + lowering/LowerCooperativeMatrix.cpp + lowering/PrepareContinuations.cpp + lowering/LowerAdvancedBlend.cpp + lowering/ProcessGfxRuntimeLibrary.cpp ) # llpc/translator @@ -385,7 +386,7 @@ target_include_directories(llpc_standalone_compiler PUBLIC ${PROJECT_SOURCE_DIR}/../util ${PROJECT_SOURCE_DIR}/context ${PROJECT_SOURCE_DIR}/include - ${PROJECT_SOURCE_DIR}/lower + ${PROJECT_SOURCE_DIR}/lowering ${PROJECT_SOURCE_DIR}/tool ${PROJECT_SOURCE_DIR}/translator/include ${PROJECT_SOURCE_DIR}/translator/lib/SPIRV diff --git a/llpc/context/llpcCompiler.cpp b/llpc/context/llpcCompiler.cpp index 433757dac3..fb897e9acd 100644 --- a/llpc/context/llpcCompiler.cpp +++ b/llpc/context/llpcCompiler.cpp @@ -34,6 +34,8 @@ #include "LowerCfgMerges.h" #include "LowerRayTracing.h" #include "LowerTranslator.h" +#include "Lowering.h" +#include "LoweringUtil.h" #include "PrepareContinuations.h" #include "SPIRVEntry.h" #include "SPIRVFunction.h" @@ -49,8 +51,6 @@ #include "llpcGraphicsContext.h" #include "llpcRayTracingContext.h" #include "llpcShaderModuleHelper.h" -#include "llpcSpirvLower.h" -#include "llpcSpirvLowerUtil.h" #include "llpcThreading.h" #include "llpcTimerProfiler.h" #include "llpcUtil.h" @@ -78,6 +78,7 @@ #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/Support/ErrorHandling.h" + #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 442438 // Old version of the code #else @@ -1855,7 +1856,7 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRefaddPass(SpirvLowerTranslator(entryStage, shaderInfoEntry)); + lowerPassMgr->addPass(LowerTranslator(entryStage, shaderInfoEntry)); if (EnableOuts()) { lowerPassMgr->addPass( PrintModulePass(outs(), "\n" @@ -2019,7 +2020,7 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRef lock(getHelperThreadMutex()); - rtContext->getRayTracingLibrarySummary().maxUsedPayloadRegisterCount = - 
std::max(rtContext->getRayTracingLibrarySummary().maxUsedPayloadRegisterCount, maxUsedPayloadRegisterCount); + rtContext->getRayTracingLibrarySummary().llvmRaytracingState.merge(*moduleStateOrErr); } } @@ -3107,7 +3110,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, SpirvLower::registerTranslationPasses(*lowerPassMgr); // SPIR-V translation, then dump the result. - lowerPassMgr->addPass(SpirvLowerTranslator(shaderInfoEntry->entryStage, shaderInfoEntry)); + lowerPassMgr->addPass(LowerTranslator(shaderInfoEntry->entryStage, shaderInfoEntry)); lowerPassMgr->addPass(LowerCfgMerges()); lowerPassMgr->addPass(AlwaysInlinerPass()); @@ -3312,8 +3315,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, // Build traversal at last after we gather all needed information. if (traversalModule) { if (isContinuationsMode) - ContHelper::setPreservedPayloadRegisterCount(*traversalModule, - rtContext.getRayTracingLibrarySummary().maxUsedPayloadRegisterCount); + rtContext.getRayTracingLibrarySummary().llvmRaytracingState.exportModuleMetadata(*traversalModule); auto rayFlagsKnownBits = rtContext.getRayFlagsKnownBits(); lgc::gpurt::setKnownSetRayFlags(*traversalModule, rayFlagsKnownBits.One.getZExtValue()); diff --git a/llpc/context/llpcComputeContext.h b/llpc/context/llpcComputeContext.h index 7dccf683bc..f554ff9f9d 100644 --- a/llpc/context/llpcComputeContext.h +++ b/llpc/context/llpcComputeContext.h @@ -40,7 +40,7 @@ class ComputeContext : public PipelineContext { public: ComputeContext(GfxIpVersion gfxIp, const ComputePipelineBuildInfo *pipelineInfo, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash); - virtual ~ComputeContext() {} + virtual ~ComputeContext() = default; virtual PipelineType getPipelineType() const override { return PipelineType::Compute; } diff --git a/llpc/context/llpcContext.cpp b/llpc/context/llpcContext.cpp index c1476c6411..661fb3f60d 100644 --- a/llpc/context/llpcContext.cpp +++ 
b/llpc/context/llpcContext.cpp @@ -34,13 +34,13 @@ #include "LowerCfgMerges.h" #include "LowerGlobals.h" #include "LowerTranslator.h" +#include "Lowering.h" #include "ProcessGfxRuntimeLibrary.h" #include "ProcessGpuRtLibrary.h" #include "SPIRVInternal.h" #include "llpcCompiler.h" #include "llpcDebug.h" #include "llpcPipelineContext.h" -#include "llpcSpirvLower.h" #include "llpcTimerProfiler.h" #include "vkgcMetroHash.h" #include "gfxruntime/GfxRuntimeLibrary.h" @@ -100,10 +100,6 @@ Context::Context(GfxIpVersion gfxIp) : LLVMContext(), m_gfxIp(gfxIp) { reset(); } -// ===================================================================================================================== -Context::~Context() { -} - // ===================================================================================================================== void Context::reset() { m_pipelineContext = nullptr; @@ -127,7 +123,6 @@ LgcContext *Context::getLgcContext() { lgc::GpurtContext::get(*this).theModule = nullptr; lgc::GpurtContext::get(*this).ownedTheModule.reset(); lgc::GfxRuntimeContext::get(*this).theModule.reset(); - // Pass the state of LLPC_OUTS on to LGC. LgcContext::setLlpcOuts(EnableOuts() ? 
&outs() : nullptr); } @@ -273,7 +268,7 @@ void Context::ensureGpurtLibrary() { timerProfiler.addTimerStartStopPass(*lowerPassMgr, TimerTranslate, true); - lowerPassMgr->addPass(SpirvLowerTranslator(ShaderStageCompute, &shaderInfo, "_gpurtvar_")); + lowerPassMgr->addPass(LowerTranslator(ShaderStageCompute, &shaderInfo, "_gpurtvar_")); if (EnableOuts()) { lowerPassMgr->addPass( PrintModulePass(outs(), "\n" @@ -335,7 +330,7 @@ void Context::ensureGfxRuntimeLibrary() { timerProfiler.addTimerStartStopPass(*lowerPassMgr, TimerTranslate, true); - lowerPassMgr->addPass(SpirvLowerTranslator(ShaderStageCompute, &shaderInfo)); + lowerPassMgr->addPass(LowerTranslator(ShaderStageCompute, &shaderInfo)); if (EnableOuts()) { lowerPassMgr->addPass( PrintModulePass(outs(), "\n" diff --git a/llpc/context/llpcContext.h b/llpc/context/llpcContext.h index ebc730a825..87ff860b3f 100644 --- a/llpc/context/llpcContext.h +++ b/llpc/context/llpcContext.h @@ -51,7 +51,6 @@ namespace Llpc { class Context : public llvm::LLVMContext { public: Context(GfxIpVersion gfxIp); - ~Context(); void reset(); diff --git a/llpc/context/llpcGraphicsContext.cpp b/llpc/context/llpcGraphicsContext.cpp index 762981b296..18492fd680 100644 --- a/llpc/context/llpcGraphicsContext.cpp +++ b/llpc/context/llpcGraphicsContext.cpp @@ -88,10 +88,6 @@ GraphicsContext::GraphicsContext(GfxIpVersion gfxIp, const GraphicsPipelineBuild m_pipelineApiHash = pipelineInfo->pipelineApiHash; } -// ===================================================================================================================== -GraphicsContext::~GraphicsContext() { -} - // ===================================================================================================================== // Gets pipeline shader info of the specified shader stage // diff --git a/llpc/context/llpcGraphicsContext.h b/llpc/context/llpcGraphicsContext.h index 247cf318f2..ac5515ca78 100644 --- a/llpc/context/llpcGraphicsContext.h +++ 
b/llpc/context/llpcGraphicsContext.h @@ -42,7 +42,7 @@ class GraphicsContext : public PipelineContext { public: GraphicsContext(GfxIpVersion gfxIp, const GraphicsPipelineBuildInfo *pipelineInfo, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash); - virtual ~GraphicsContext(); + virtual ~GraphicsContext() = default; virtual PipelineType getPipelineType() const override { return PipelineType::Graphics; } diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index 21ba35235f..6799fd4ae1 100644 --- a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -144,10 +144,6 @@ PipelineContext::PipelineContext(GfxIpVersion gfxIp, MetroHash::Hash *pipelineHa : m_gfxIp(gfxIp), m_pipelineHash(*pipelineHash), m_cacheHash(*cacheHash) { } -// ===================================================================================================================== -PipelineContext::~PipelineContext() { -} - // ===================================================================================================================== // Gets the hash code of input shader with specified shader stage. 
// @@ -202,7 +198,7 @@ void PipelineContext::setRayTracingState(const Vkgc::RtState &rtState, const Vkg m_rtState.rtIpVersion = Vkgc::gpurt::getRtIpVersion(m_gfxIp); if (m_rtState.rtIpVersion.major != 0 && !m_rtState.gpurtOverride) { - gpurt::getShaderLibrarySpirv(m_rtState.gpurtFeatureFlags, m_rtState.gpurtShaderLibrary.pCode, + gpurt::getShaderLibrarySpirv(m_rtState.rtIpVersion, m_rtState.gpurtFeatureFlags, m_rtState.gpurtShaderLibrary.pCode, m_rtState.gpurtShaderLibrary.codeSize); gpurt::getFuncTable(m_rtState.rtIpVersion, m_rtState.gpurtFuncTable); } @@ -401,7 +397,7 @@ void PipelineContext::convertResourceNode(ResourceNode &dst, const ResourceMappi dst.sizeInDwords = src.sizeInDwords; dst.offsetInDwords = src.offsetInDwords; dst.abstractType = ResourceNodeType::Unknown; - dst.visibility = visibility; + dst.visibility = ShaderStageMask::fromRaw(visibility); switch (src.type) { case ResourceMappingNodeType::DescriptorTableVaPtr: { diff --git a/llpc/context/llpcPipelineContext.h b/llpc/context/llpcPipelineContext.h index b00ed8a898..0a3d7caa4b 100644 --- a/llpc/context/llpcPipelineContext.h +++ b/llpc/context/llpcPipelineContext.h @@ -119,7 +119,7 @@ enum class PipelineType { class PipelineContext { public: PipelineContext(GfxIpVersion gfxIp, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash); - virtual ~PipelineContext(); + virtual ~PipelineContext() = default; // Returns the pipeline type virtual PipelineType getPipelineType() const = 0; diff --git a/llpc/context/llpcRayTracingContext.h b/llpc/context/llpcRayTracingContext.h index c520c81854..12b77904ca 100644 --- a/llpc/context/llpcRayTracingContext.h +++ b/llpc/context/llpcRayTracingContext.h @@ -49,7 +49,7 @@ class RayTracingContext : public PipelineContext { RayTracingContext(GfxIpVersion gfxIp, const RayTracingPipelineBuildInfo *pipelineInfo, const PipelineShaderInfo *representativeShaderInfo, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash, unsigned indirectStageMask); - virtual 
~RayTracingContext() {} + virtual ~RayTracingContext() = default; virtual PipelineType getPipelineType() const override { return PipelineType::RayTracing; } diff --git a/llpc/docs/DdnBindlessTexture.md b/llpc/docs/DdnBindlessTexture.md index 30f506efde..f55908467a 100644 --- a/llpc/docs/DdnBindlessTexture.md +++ b/llpc/docs/DdnBindlessTexture.md @@ -371,7 +371,7 @@ void main() ... ``` -After the above change, we can see the pipeline dumps for the above shader, the pass “LLPC translate SPIR-V binary to LLVM IR” and the ISA code dump looks as following, the cases that declare bindless textures by as uniform uvec2 type can run correctly. +After the above change, we can see the pipeline dumps for the above shader, the pass "LLPC translate SPIR-V binary to LLVM IR" and the ISA code dump looks as following, the cases that declare bindless textures by as uniform uvec2 type can run correctly. ![](./DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG) diff --git a/llpc/docs/DdnInterShaderDataCacheTracking.md b/llpc/docs/DdnInterShaderDataCacheTracking.md index f46d1d6278..6d5aaac07c 100644 --- a/llpc/docs/DdnInterShaderDataCacheTracking.md +++ b/llpc/docs/DdnInterShaderDataCacheTracking.md @@ -22,7 +22,7 @@ Historical Background At the time of writing, LLPC already uses variant 2 described in the introduction. It does so by combining various pieces of information that have been saved off in a look-aside data structure (`ResourceUsage`) in the -`PatchCheckShaderCache` pass. +`CheckShaderCache` pass. This approach has two downsides: @@ -30,7 +30,7 @@ This approach has two downsides: the information may otherwise be obsolete. 2. Knowledge about what kind of full pipeline optimizations are applied, including some of their details, is - centralized in `PatchCheckShaderCache`. This limits the design's extensibility. + centralized in `CheckShaderCache`. This limits the design's extensibility. 
Implementation overview ----------------------- @@ -39,7 +39,7 @@ Every function in LLPC gets an attached `!llpc.hash` metadata node. This metadata node is initialized with the relevant input shader hash. Every pass that performs inter-shader transforms updates the metadata by hashing the old hash together with any data that is relevant from other shaders, i.e. it computes `h_new = h(h_old | inter-shader data)`. -The metadata node is finally inspected in the `PatchCheckShaderCache` pass. +The metadata node is finally inspected in the `CheckShaderCache` pass. Extensible specialized metadata ------------------------------- diff --git a/llpc/include/llpc.h b/llpc/include/llpc.h index f74d043222..86eb8986de 100644 --- a/llpc/include/llpc.h +++ b/llpc/include/llpc.h @@ -220,7 +220,7 @@ class IShaderCache { IShaderCache() {} /// @internal Destructor. Prevent use of delete operator on this interface. - virtual ~IShaderCache() {} + virtual ~IShaderCache() = default; }; #endif @@ -363,7 +363,7 @@ class ICompiler { protected: ICompiler() {} /// Destructor - virtual ~ICompiler() {} + virtual ~ICompiler() = default; }; } // namespace Llpc diff --git a/llpc/lower/LowerAccessChain.cpp b/llpc/lowering/LowerAccessChain.cpp similarity index 100% rename from llpc/lower/LowerAccessChain.cpp rename to llpc/lowering/LowerAccessChain.cpp diff --git a/llpc/lower/LowerAccessChain.h b/llpc/lowering/LowerAccessChain.h similarity index 99% rename from llpc/lower/LowerAccessChain.h rename to llpc/lowering/LowerAccessChain.h index 4d551b91b6..58fd43d875 100644 --- a/llpc/lower/LowerAccessChain.h +++ b/llpc/lowering/LowerAccessChain.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lower/LowerAdvancedBlend.cpp b/llpc/lowering/LowerAdvancedBlend.cpp similarity index 100% rename from llpc/lower/LowerAdvancedBlend.cpp rename to 
llpc/lowering/LowerAdvancedBlend.cpp diff --git a/llpc/lower/LowerAdvancedBlend.h b/llpc/lowering/LowerAdvancedBlend.h similarity index 98% rename from llpc/lower/LowerAdvancedBlend.h rename to llpc/lowering/LowerAdvancedBlend.h index 61412ecada..3a1af04f3d 100644 --- a/llpc/lower/LowerAdvancedBlend.h +++ b/llpc/lowering/LowerAdvancedBlend.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/ADT/DenseMap.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lower/LowerCfgMerges.cpp b/llpc/lowering/LowerCfgMerges.cpp similarity index 99% rename from llpc/lower/LowerCfgMerges.cpp rename to llpc/lowering/LowerCfgMerges.cpp index faba98c13b..e293b81caa 100644 --- a/llpc/lower/LowerCfgMerges.cpp +++ b/llpc/lowering/LowerCfgMerges.cpp @@ -36,11 +36,11 @@ *********************************************************************************************************************** */ #include "LowerCfgMerges.h" +#include "Lowering.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcDebug.h" -#include "llpcSpirvLower.h" -#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" #include "lgc/LgcDialect.h" #include "llvm/ADT/DepthFirstIterator.h" diff --git a/llpc/lower/LowerCfgMerges.h b/llpc/lowering/LowerCfgMerges.h similarity index 98% rename from llpc/lower/LowerCfgMerges.h rename to llpc/lowering/LowerCfgMerges.h index 0cdba4218e..2e22ac5328 100644 --- a/llpc/lower/LowerCfgMerges.h +++ b/llpc/lowering/LowerCfgMerges.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/ADT/DenseSet.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lower/LowerConstImmediateStore.cpp b/llpc/lowering/LowerConstImmediateStore.cpp similarity index 95% rename from llpc/lower/LowerConstImmediateStore.cpp rename to llpc/lowering/LowerConstImmediateStore.cpp index 4c0ae2502e..2311e304fd 100644 --- a/llpc/lower/LowerConstImmediateStore.cpp +++ 
b/llpc/lowering/LowerConstImmediateStore.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file LowerConstImmediateStore.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerConstImmediateStore. + * @brief LLPC source file: contains implementation of class Llpc::LowerConstImmediateStore. *********************************************************************************************************************** */ #include "LowerConstImmediateStore.h" @@ -38,7 +38,7 @@ #include "llvm/Support/Debug.h" #include -#define DEBUG_TYPE "llpc-spirv-lower-const-immediate-store" +#define DEBUG_TYPE "lower-const-immediate-store" using namespace llvm; using namespace SPIRV; @@ -51,8 +51,8 @@ namespace Llpc { // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerConstImmediateStore::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Const-Immediate-Store\n"); +PreservedAnalyses LowerConstImmediateStore::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Const-Immediate-Store\n"); SpirvLower::init(&module); @@ -75,7 +75,7 @@ PreservedAnalyses SpirvLowerConstImmediateStore::run(Module &module, ModuleAnaly // can be optimized to a read-only global variable. // // @param func : Function to process -bool SpirvLowerConstImmediateStore::processAllocaInsts(Function *func) { +bool LowerConstImmediateStore::processAllocaInsts(Function *func) { // NOTE: We only visit the entry block on the basis that SPIR-V translator puts all "alloca" // instructions there. 
bool changed = false; @@ -98,7 +98,7 @@ bool SpirvLowerConstImmediateStore::processAllocaInsts(Function *func) { // // @param allocaInst : The "alloca" instruction to process // @return true if the alloca was replaced -bool SpirvLowerConstImmediateStore::tryProcessAlloca(AllocaInst *allocaInst) { +bool LowerConstImmediateStore::tryProcessAlloca(AllocaInst *allocaInst) { // LLVM IR allocas can have an "arrayness" where multiple elements of the allocated type are allocated at once. // SPIR-V doesn't have this (because it only has OpVariable and not a "true" alloca), but let's guard against it // anyway just in case. diff --git a/llpc/lower/LowerConstImmediateStore.h b/llpc/lowering/LowerConstImmediateStore.h similarity index 93% rename from llpc/lower/LowerConstImmediateStore.h rename to llpc/lowering/LowerConstImmediateStore.h index 2df8c0b1c4..ba7828e83f 100644 --- a/llpc/lower/LowerConstImmediateStore.h +++ b/llpc/lowering/LowerConstImmediateStore.h @@ -25,12 +25,12 @@ /** *********************************************************************************************************************** * @file LowerConstImmediateStore.h - * @brief LLPC header file: contains declaration of class Llpc::SpirvLowerConstImmediateStore. + * @brief LLPC header file: contains declaration of class Llpc::LowerConstImmediateStore. 
*********************************************************************************************************************** */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/PassManager.h" namespace llvm { @@ -43,7 +43,7 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering operations for constant immediate store -class SpirvLowerConstImmediateStore : public SpirvLower, public llvm::PassInfoMixin { +class LowerConstImmediateStore : public SpirvLower, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/llpc/lower/LowerCooperativeMatrix.cpp b/llpc/lowering/LowerCooperativeMatrix.cpp similarity index 98% rename from llpc/lower/LowerCooperativeMatrix.cpp rename to llpc/lowering/LowerCooperativeMatrix.cpp index 432f8279c4..5f501eb987 100644 --- a/llpc/lower/LowerCooperativeMatrix.cpp +++ b/llpc/lowering/LowerCooperativeMatrix.cpp @@ -39,7 +39,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#define DEBUG_TYPE "llpc-spirv-lower-cooperative-matrix" +#define DEBUG_TYPE "lower-cooperative-matrix" using namespace llvm; using namespace lgc; @@ -166,7 +166,7 @@ void LowerCooperativeMatrix::visitPointerUsers(Value *ptr, CooperativeMatrixElem // // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerCooperativeMatrix::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerCooperativeMatrixProxy::run(Module &module, ModuleAnalysisManager &analysisManager) { LowerCooperativeMatrix impl{module}; return impl.run(); } diff --git a/llpc/lower/LowerCooperativeMatrix.h b/llpc/lowering/LowerCooperativeMatrix.h similarity index 92% rename from llpc/lower/LowerCooperativeMatrix.h 
rename to llpc/lowering/LowerCooperativeMatrix.h index ea15854624..52d60e1b47 100644 --- a/llpc/lower/LowerCooperativeMatrix.h +++ b/llpc/lowering/LowerCooperativeMatrix.h @@ -36,11 +36,11 @@ namespace Llpc { // ===================================================================================================================== // Pass that lower SPIR-V-specific cooperative matrix operations -class SpirvLowerCooperativeMatrix : public llvm::PassInfoMixin { +class LowerCooperativeMatrixProxy : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); - static llvm::StringRef name() { return "spirv-lower-cooperative-matrix"; } + static llvm::StringRef name() { return "lower-cooperative-matrix"; } }; } // namespace Llpc diff --git a/llpc/lowering/LowerExecutionGraph.cpp b/llpc/lowering/LowerExecutionGraph.cpp new file mode 100644 index 0000000000..a6d3de614c --- /dev/null +++ b/llpc/lowering/LowerExecutionGraph.cpp @@ -0,0 +1,986 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerExecutionGraph.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerExecutionGraph. + *********************************************************************************************************************** + */ +#include "LowerExecutionGraph.h" +#include "SPIRVInternal.h" +#include "llpcContext.h" +#include "llpcExecutionGraphContext.h" +#include "compilerutils/CompilerUtils.h" +#include "compilerutils/TypeLowering.h" +#include "lgc/Builder.h" +#include "lgc/BuiltIns.h" +#include "lgc/LgcDialect.h" +#include "lgc/LgcWgDialect.h" +#include "lgc/Pipeline.h" +#include "lgc/RuntimeContext.h" +#include "llvm/IR/DerivedUser.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/Operator.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "lower-execution-graph" + +using namespace CompilerUtils; +using namespace lgc; +using namespace lgc::wg; +using namespace llvm; +using namespace Llpc; +using namespace spv; +using namespace llvm_dialects; + +namespace SPIRV { +extern const char *MetaNameSpirvOp; +} // namespace SPIRV + +using namespace SPIRV; +namespace WorkGraphFunc { +enum : unsigned { + ShaderPreamble = 0, // Preamble function + ShaderPostamble, // Postamble function 
+ OutputAllocate, // Allocates output records for a successor node. + OutputCommit, // Commits previously-allocated output records for a successor node. + OutputGetPayload, // Retrieves the GPU VA of a specific output payload entry + InputGetPayloadCount, // Retrieves the input payload count. + InputGetPayloadAtIndex, // Retrieves the GPU address for an input payload at the specified index + WorkgroupId, // Workgroup ID + GlobalThreadId, // Global Thread ID, + ShaderEmptyInputPreamble, // Empty input preamble + IncrementEmptyOutputCount, // Empty output count + InitCrossGroupSharing, // Init cross group sharing + FinishCrossGroupSharing, // Finish cross group sharing + IsOutputNodePresent, // Checks if an output node is valid + GetRemainingRecursionDepth, // Get remaining recursion depth + IsThreadLaunchInvocationValid, // Is ThreadLaunch Invocation Valid + Count +}; +} // namespace WorkGraphFunc + +const char *WorkGraphNames[] = { + "AmdWorkGraphsShaderPreamble", // ShaderPreamble + "AmdWorkGraphsShaderPostamble", // ShaderPostamble + "AmdWorkGraphsOutputAllocate", // OutputAllocate + "AmdWorkGraphsOutputCommit", // OutputCommit + "AmdWorkGraphsOutputGetPayload", // OutputGetPayload + "AmdWorkGraphsInputGetPayloadCount", // InputGetPayloadCount + "AmdWorkGraphsInputGetPayloadAtIndex", // InputGetPayloadAtIndex + "AmdWorkGraphsGroupId", // WorkgroupId + "AmdWorkGraphsGlobalThreadId", // GlobalThreadId + "AmdWorkGraphsShaderEmptyInputPreamble", // Empty input preamble + "AmdWorkGraphsIncrementEmptyOutputCount", // Empty output count + "AmdWorkGraphsInitCrossGroupSharing", // Init cross group sharing + "AmdWorkGraphsFinishCrossGroupSharing", // Finish cross group sharing + "AmdWorkGraphsIsOutputNodePresent", // Checks if an output node is valid + "AmdWorkGraphsGetRemainingRecursionDepth", // Current graphs recursion depth + "AmdWorkGraphsIsThreadLaunchInvocationValid", // Is ThreadLaunch Invocation Valid +}; + +static const char *OutputArgNames[] = {"ShaderState", 
"Scope", "OutputIdx", "ArrayIdx", "Count"}; +static const char *EntryFuncName = "shader"; // Execution graph entry name +const char *WorkgraphOutputCount = "WorkgraphOutputCount"; +const char *WorkgraphGetLds = "WorkgraphGetLds"; + +namespace { + +struct LoweringVisitorPayload { + Llpc::LowerExecutionGraph &pass; + TypeLowering typeLower; + + explicit LoweringVisitorPayload(Type *payloadArrayPtrType, Llpc::LowerExecutionGraph &pass) + : pass(pass), typeLower(payloadArrayPtrType->getContext()) { + typeLower.addRule([payloadArrayPtrType](TypeLowering &, Type * type) -> auto { + SmallVector lowered; + auto &context = type->getContext(); + if (type->isPointerTy() && type->getPointerAddressSpace() == SPIRAS_PayloadArray) { + lowered.push_back(PointerType::get(context, SPIRAS_Private)); + lowered.push_back(payloadArrayPtrType); + } else if (isPayloadType(type)) { + lowered.push_back(payloadArrayPtrType); + } + return lowered; + }); + typeLower.addConstantRule([](TypeLowering &, Constant * c, ArrayRef loweredTypes) -> auto { + SmallVector lowered; + if (auto *gv = dyn_cast(c)) { + if (gv->getAddressSpace() == SPIRAS_PayloadArray) { + // Stand-in for an input payload array. We don't actually need the value for anything. 
+ lowered.push_back(PoisonValue::get(loweredTypes[0])); + } + } + return lowered; + }); + } +}; + +} // anonymous namespace + +template <> struct llvm_dialects::VisitorPayloadProjection { + static Llpc::LowerExecutionGraph &project(LoweringVisitorPayload &payload) { return payload.pass; } +}; + +LLVM_DIALECTS_VISITOR_PAYLOAD_PROJECT_FIELD(LoweringVisitorPayload, typeLower) + +namespace Llpc { + +static constexpr unsigned MaxGridCount = 65535; // Max dispatch grid count + +// ===================================================================================================================== +LowerExecutionGraph::LowerExecutionGraph(Pipeline *pipeline) + : m_pipeline(pipeline), m_graphLds(nullptr), m_threadLaunch(false) { + for (unsigned i = 0; i < WorkGraphFunc::Count; ++i) { + m_workGraphLibFuncNames[WorkGraphNames[i]] = i; + } +} + +// ===================================================================================================================== +// Executes this SPIR-V lowering pass on the specified LLVM module. 
// @param [in/out] module : LLVM module to be run on
m_builder->SetInsertPointPastAllocas(m_entryPoint); + initAllocVariables(m_builder); + // Call ShaderPreamble + // NOTE: according to the PAL comment notes to the EmptyInputPreamble, for dynamic dispatch workgroup, implied by + // the MaxNumWorkgroupsAMDX is not zero, dynamic expansion nodes cannot have zero-byte payloads because the grid + // size is 12 bytes. + CrossModuleInliner inliner; + auto gprsVariable = + (m_inputPayloadInfo.payloadSize == 0 && m_enqueueModes.modes.maxNumWorkgroupsX == 0 && + m_enqueueModes.modes.maxNumWorkgroupsY == 0) + ? inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::ShaderEmptyInputPreamble], {}).returnValue + : inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::ShaderPreamble], {}).returnValue; + + // Keep the gprs variable from ShaderPreamble call + m_builder->CreateStore(gprsVariable, m_outputAllocateArgs[OutputAllocateArg::ShaderState]); + + // Create input counts number + auto inputsCount = inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::InputGetPayloadCount], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue; + m_builder->CreateStore(inputsCount, m_builtInVariables[WorkGraphBuiltIns::CoalescedInputCount]); + + auto remaining = inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::GetRemainingRecursionDepth], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue; + m_builder->CreateStore(remaining, m_builtInVariables[WorkGraphBuiltIns::RemainingRecursionLevels]); + + unsigned shaderIndex = + m_inputPayloadInfo.arrayIndex != InvalidValue ? 
m_inputPayloadInfo.arrayIndex : m_enqueueModes.modes.shaderIndex; + m_builder->CreateStore(m_builder->getInt32(shaderIndex), m_builtInVariables[WorkGraphBuiltIns::ShaderIndex]); + auto shaderMode = Pipeline::getComputeShaderMode(module); + m_threadLaunch = isThreadLaunchNode(shaderMode, m_enqueueModes, m_inputPayloadInfo); + auto zero = m_builder->getInt32(0); + auto constVec = ConstantVector::get({zero, zero, zero}); + + if (m_threadLaunch) { + auto valid = + cast(inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::IsThreadLaunchInvocationValid], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue); + auto nextPos = valid->getNextNode(); + Instruction *terminator = SplitBlockAndInsertIfElse(valid, m_builder->GetInsertPoint(), false); + m_builder->SetInsertPoint(terminator); + m_builder->CreateRetVoid(); + terminator->eraseFromParent(); + + m_builder->SetInsertPoint(nextPos); + m_localInvocationIndex = + new GlobalVariable(*m_module, zero->getType(), false, GlobalVariable::ExternalLinkage, nullptr, "localIndex", + nullptr, GlobalValue::NotThreadLocal, SPIRAS_Private); + m_builder->CreateStore(zero, m_localInvocationIndex); + shaderMode.workgroupSizeX = 32; + Pipeline::setComputeShaderMode(module, shaderMode); + } + if (m_enqueueModes.modes.isCoalescing) { + // Create WorkgroupId + m_builder->CreateStore(constVec, m_builtInVariables[WorkGraphBuiltIns::WorkgroupId]); + // Create GlobalInvocationId + Value *localInvocationId = + m_threadLaunch + ? 
constVec + : m_builder->CreateReadBuiltInInput(static_cast(lgc::BuiltInLocalInvocationId)); + m_builder->CreateStore(localInvocationId, m_builtInVariables[WorkGraphBuiltIns::GlobalInvocationId]); + + } else { + // Create WorkgroupId + auto workGroupId = inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::WorkgroupId], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue; + m_builder->CreateStore(workGroupId, m_builtInVariables[WorkGraphBuiltIns::WorkgroupId]); + + // Create GlobalInvocationId + auto globalInvocationId = inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::GlobalThreadId], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue; + m_builder->CreateStore(globalInvocationId, m_builtInVariables[WorkGraphBuiltIns::GlobalInvocationId]); + } + + SmallVector rets; + getFuncRets(m_entryPoint, rets); + for (auto ret : rets) { + m_builder->SetInsertPoint(ret); + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::ShaderPostamble], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}); + } + LoweringVisitorPayload payload(m_payloadArrayPtrType, *this); + m_typeLowering = &payload.typeLower; + static const auto visitor = llvm_dialects::VisitorBuilder() + .nest([](auto &b) { + b.add(&LowerExecutionGraph::visitLoad); + b.add(&LowerExecutionGraph::visitStore); + b.add(&LowerExecutionGraph::visitAlloca); + b.add(&LowerExecutionGraph::visitGetElementPtr); + b.add(&LowerExecutionGraph::visitIndexPayloadArray); + b.add(&LowerExecutionGraph::visitAllocateNodePayloads); + b.add(&LowerExecutionGraph::visitEnqueueNodePayloads); + b.add(&LowerExecutionGraph::visitPayloadArrayLength); + b.add(&LowerExecutionGraph::visitIsNodePayloadValid); + b.add(&LowerExecutionGraph::visitFinishWritingNodePayload); + }) + .nest(&TypeLowering::registerVisitors) + .build(); + + visitor.visit(payload, *m_module); + payload.typeLower.finishPhis(); + payload.typeLower.finishCleanup(); + m_typeLowering = nullptr; + 
buildExecGraphNodeMetadata(m_enqueueModes, m_inputPayloadInfo); + lowerGlobals(m_metaEnqueueId, m_context->getMDKindID(gSPIRVMD::InOut)); + unsigned outputCount = m_nodeNamesIdx.size(); + createGraphLds(outputCount); + // Post visit dialects after Workgraph library functions inlined + static const auto postVisitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add(&LowerExecutionGraph::visitGraphGetLds) + .add(&LowerExecutionGraph::visitOutputCount) + .build(); + postVisitor.visit(*this, *m_module); + + return PreservedAnalyses::none(); +} + +// ===================================================================================================================== +// Pre-parse to the RegisterOutputNodeOp to get number of node types/names, and setup m_nodeNamesIdx +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitRegisterOutputNode(lgc::wg::RegisterOutputNodeOp &inst) { + static const unsigned remappedScopes[3] = {WorkCreationScope::Workgroup, WorkCreationScope::Subgroup, + WorkCreationScope::Invocation}; + unsigned scope = inst.getScope(); + assert(scope == ScopeWorkgroup || scope == ScopeSubgroup || scope == ScopeInvocation); + unsigned remappedScope = remappedScopes[scope - ScopeWorkgroup]; + + auto payloadNameVar = cast(inst.getPayloadName()); + auto payloadName = cast(payloadNameVar->getInitializer())->getAsString(); + + unsigned payloadSize = inst.getPayloadSize(); + unsigned payloadMaxCount = inst.getPayloadMaxCount(); + unsigned payloadId = inst.getPayloadId(); + unsigned limitsSharedWith = inst.getLimitsSharedWith(); + bool trackFinishWriting = inst.getTrackFinishWriting(); + unsigned payloadArrayTyId = inst.getArrayTypeId(); + + auto nameIter = m_nodeNamesIdx.find(payloadName); + if (nameIter == m_nodeNamesIdx.end()) { + m_nodeNamesIdx[payloadName] = {payloadMaxCount, payloadSize, payloadId, + limitsSharedWith, remappedScope, trackFinishWriting, + inst.getArraySize(), 
payloadArrayTyId, 0}; + nameIter = m_nodeNamesIdx.find(payloadName); + } else { + // Add up the payloadMaxCount for the same output node + nameIter->second.payloadCount += payloadMaxCount; + nameIter->second.payloadSize = std::max(nameIter->second.payloadSize, payloadSize); + } + m_typeLowering->eraseInstruction(&inst); +} + +// ===================================================================================================================== +// Lower an allocate.node.payloads op +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitAllocateNodePayloads(lgc::wg::AllocateNodePayloadsOp &inst) { + m_builder->SetInsertPoint(&inst); + auto payloadNameVar = cast(inst.getPayloadName()); + auto payloadName = cast(payloadNameVar->getInitializer())->getAsString(); + auto nameIter = m_nodeNamesIdx.find(payloadName); + + auto baseIndex = inst.getBaseIndex(); + + m_builder->CreateStore(m_builder->getInt32(nameIter->second.scope), m_outputAllocateArgs[OutputAllocateArg::Scope]); + + // MapVector will keep the index of insertion, so the OutputIndex would be index of Output payload nodes names array. + // Each array member must have a unique node name, array index is the specific shader in that array. + auto OutputIndex = nameIter - m_nodeNamesIdx.begin(); + m_builder->CreateStore(m_builder->getInt32(OutputIndex), m_outputAllocateArgs[OutputAllocateArg::OutputIdx]); + + m_builder->CreateStore(inst.getPayloadCount(), m_outputAllocateArgs[OutputAllocateArg::Count]); + + Value *nodeIdx = inst.getNodeIndex(); + nodeIdx = m_builder->CreateAdd(nodeIdx, baseIndex); + const bool recursiveNode = + (m_enqueueModes.modes.maxNodeRecursion > 0) && (payloadName == m_inputPayloadInfo.nodeName); + if (recursiveNode) { + // NOTE: Always needs to be 0 for recursive calls since recursive output ports always have an + // array index offset equal to the parent. 
No need to check the array index provided by the + // app since the only legal case is self-recursion (the node calling itself, same name, same index). + nodeIdx = m_builder->getInt32(0); + } + m_builder->CreateStore(nodeIdx, m_outputAllocateArgs[OutputAllocateArg::ArrayIdx]); + + // Call OutputAllocate + SmallVector args; + for (auto arg : m_outputAllocateArgs) { + args.push_back(arg); + } + CrossModuleInliner inliner; + Value *outputRecords = nullptr; + if (nameIter->second.payloadSize == 0) { + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::IncrementEmptyOutputCount], args); + outputRecords = PoisonValue::get(getOutputRecordsTy()); + } else { + outputRecords = inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::OutputAllocate], args).returnValue; + } + auto dummyValue = ConstantPointerNull::get(PointerType::get(*m_context, SPIRAS_Private)); + m_typeLowering->replaceInstruction(&inst, {dummyValue, outputRecords}); +} + +// ===================================================================================================================== +// Lower an enqueue.node.payloads op +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitEnqueueNodePayloads(lgc::wg::EnqueueNodePayloadsOp &inst) { + m_builder->SetInsertPoint(&inst); + auto *payloadArrayPtr = m_typeLowering->getValue(inst.getPayloads())[0]; + auto payloadNameVar = cast(inst.getPayloadName()); + auto payloadName = cast(payloadNameVar->getInitializer())->getAsString(); + + auto nameIter = m_nodeNamesIdx.find(payloadName); + assert(nameIter != m_nodeNamesIdx.end()); + m_builder->CreateStore(m_builder->getInt32(nameIter->second.scope), m_outputAllocateArgs[OutputAllocateArg::Scope]); + + SmallVector args = {m_outputAllocateArgs[OutputAllocateArg::ShaderState], + m_outputAllocateArgs[OutputAllocateArg::Scope], payloadArrayPtr}; + CrossModuleInliner inliner; + if (nameIter->second.trackFinishWriting) { + inliner.inlineCall(*m_builder, 
m_graphLibFuncs[WorkGraphFunc::InitCrossGroupSharing], args); + } + + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::OutputCommit], args); + + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// ===================================================================================================================== +// Lower a finish.writing.node.payload op +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitFinishWritingNodePayload(wg::FinishWritingNodePayloadOp &inst) { + m_builder->SetInsertPoint(&inst); + CrossModuleInliner inliner; + inst.replaceAllUsesWith(inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::FinishCrossGroupSharing], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue); + + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// ===================================================================================================================== +// Lower a payload.array.length op +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitPayloadArrayLength(wg::PayloadArrayLengthOp &inst) { + m_builder->SetInsertPoint(&inst); + Value *nodeCount = nullptr; + if (inst.getInput()) { + nodeCount = + m_builder->CreateLoad(m_builder->getInt32Ty(), m_builtInVariables[WorkGraphBuiltIns::CoalescedInputCount]); + } else { + // Output variable + auto *payloadArrayPtr = m_typeLowering->getValue(inst.getPayloads())[0]; + Value *args[] = {m_builder->getInt32(0), m_builder->getInt32(4)}; + nodeCount = m_builder->CreateGEP(m_payloadArrayPtrType, payloadArrayPtr, args); + nodeCount = m_builder->CreateLoad(m_builder->getInt32Ty(), nodeCount); + } + inst.replaceAllUsesWith(nodeCount); + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// 
===================================================================================================================== +// Lower a LoadInst instruction +// +// @param inst : the instruction to lower +VisitorResult LowerExecutionGraph::visitLoad(LoadInst &inst) { + m_builder->SetInsertPoint(&inst); + if (inst.getPointerOperandType()->getPointerAddressSpace() == SPIRAS_PayloadArray) { + Value *outputRecord = m_typeLowering->getValue(inst.getPointerOperand())[1]; + m_typeLowering->replaceInstruction(&inst, outputRecord); + } + return VisitorResult::Stop; +} + +// ===================================================================================================================== +// Lower a StoreInst instruction +// +// @param inst : the instruction to lower +VisitorResult LowerExecutionGraph::visitStore(StoreInst &inst) { + m_builder->SetInsertPoint(&inst); + if (inst.getPointerOperandType()->getPointerAddressSpace() == SPIRAS_PayloadArray) { + auto ptrOperand = inst.getPointerOperand(); + Value *newPtrOperand = m_typeLowering->getValue(ptrOperand)[0]; + Value *newVal = m_typeLowering->getValue(inst.getValueOperand())[0]; + m_builder->CreateStore(newVal, newPtrOperand, inst.isVolatile()); + m_typeLowering->eraseInstruction(&inst); + } + return VisitorResult::Stop; +} + +// ===================================================================================================================== +// Lower an AllocInst +// +// @param inst : the instruction to lower +VisitorResult LowerExecutionGraph::visitAlloca(AllocaInst &inst) { + m_builder->SetInsertPoint(&inst); + if (inst.getAddressSpace() == SPIRAS_PayloadArray) { + Type *allocTy = replacePayloadType(inst.getAllocatedType()); + auto newAlloc = m_builder->CreateAlloca(allocTy); + auto dummyValue = PoisonValue::get(m_payloadArrayPtrType); + m_typeLowering->replaceInstruction(&inst, {newAlloc, dummyValue}); + } + return VisitorResult::Stop; +} + +// 
===================================================================================================================== +// Lower a GetElementPtrInst +// +// @param inst : the instruction to lower +VisitorResult LowerExecutionGraph::visitGetElementPtr(GetElementPtrInst &inst) { + m_builder->SetInsertPoint(&inst); + if (inst.getAddressSpace() == SPIRAS_PayloadArray) { + Type *gepTy = replacePayloadType(inst.getSourceElementType()); + Value *srcElement = m_typeLowering->getValue(inst.getPointerOperand())[0]; + Value *newGep = nullptr; + SmallVector indices(inst.idx_begin(), inst.idx_end()); + if (inst.isInBounds()) + newGep = m_builder->CreateInBoundsGEP(gepTy, srcElement, indices); + else + newGep = m_builder->CreateGEP(gepTy, srcElement, indices); + auto dummyValue = PoisonValue::get(m_payloadArrayPtrType); + m_typeLowering->replaceInstruction(&inst, {newGep, dummyValue}); + } + return VisitorResult::Stop; +} + +// ===================================================================================================================== +// Recursive replace {} to the OutputRecordType in the aggregation type +// +// @param ty : The type to replace +Type *LowerExecutionGraph::replacePayloadType(Type *ty) { + if (isPayloadType(ty)) { + return m_payloadArrayPtrType; + } else if (ty->isStructTy()) { + SmallVector elemTys; + for (unsigned i = 0; i < ty->getStructNumElements(); ++i) + elemTys.push_back(replacePayloadType(ty->getStructElementType(i))); + return StructType::get(*m_context, elemTys); + } else if (ty->isArrayTy()) { + return ArrayType::get(replacePayloadType(ty->getArrayElementType()), ty->getArrayNumElements()); + } else + return ty; +} + +// ===================================================================================================================== +// Lower an is.node.payload.valid +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitIsNodePayloadValid(wg::IsNodePayloadValidOp &inst) { + m_builder->SetInsertPoint(&inst); + auto 
payloadNameVar = cast(inst.getPayloadName()); + auto payloadName = cast(payloadNameVar->getInitializer())->getAsString(); + auto nameIter = m_nodeNamesIdx.find(payloadName); + auto OutputIndex = nameIter - m_nodeNamesIdx.begin(); + m_builder->CreateStore(m_builder->getInt32(OutputIndex), m_outputAllocateArgs[OutputAllocateArg::OutputIdx]); + m_builder->CreateStore(inst.getNodeIndex(), m_outputAllocateArgs[OutputAllocateArg::ArrayIdx]); + Value *args[] = {m_outputAllocateArgs[OutputAllocateArg::ShaderState], + m_outputAllocateArgs[OutputAllocateArg::OutputIdx], + m_outputAllocateArgs[OutputAllocateArg::ArrayIdx]}; + CrossModuleInliner inliner; + Value *isValid = + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::IsOutputNodePresent], args).returnValue; + inst.replaceAllUsesWith(isValid); + + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// ===================================================================================================================== +// Create global variables +// +// @param builder : The builder to create variable +void LowerExecutionGraph::initAllocVariables(lgc::Builder *builder) { + Type *tys[] = {getShaderStateTy(), m_builder->getInt32Ty(), m_builder->getInt32Ty(), m_builder->getInt32Ty(), + m_builder->getInt32Ty()}; + + for (unsigned i = 0; i < m_outputAllocateArgs.size(); ++i) { + m_outputAllocateArgs[i] = m_builder->CreateAlloca(tys[i], nullptr, Twine(OutputArgNames[i])); + } + m_tempVariable = m_builder->CreateAlloca(m_builder->getInt32Ty(), nullptr, Twine("tempVariable")); + auto int32x3Ty = FixedVectorType::get(m_builder->getInt32Ty(), 3); + Type *builtInTys[] = {m_builder->getInt32Ty(), int32x3Ty, int32x3Ty, m_builder->getInt32Ty(), m_builder->getInt32Ty(), + m_builder->getInt32Ty()}; + + for (unsigned i = 0; i < WorkGraphBuiltIns::Count; ++i) { + m_builtInVariables[i] = + new GlobalVariable(*m_module, builtInTys[i], false, GlobalVariable::ExternalLinkage, nullptr, + 
Twine("builtIn") + std::to_string(i), nullptr, GlobalValue::NotThreadLocal, SPIRAS_Private); + } +} + +// ===================================================================================================================== +// Get AmdWorkGraphsShaderState type +Type *LowerExecutionGraph::getShaderStateTy() { + return m_graphLibFuncs[WorkGraphFunc::ShaderPreamble]->getReturnType(); +} + +// ===================================================================================================================== +// Get OutputRecords type +Type *LowerExecutionGraph::getOutputRecordsTy() { + return m_graphLibFuncs[WorkGraphFunc::OutputAllocate]->getReturnType(); +} + +// ===================================================================================================================== +// Get all the function ReturnInst +// +// @param func : Function to gather ReturnInst +// @param rets : Returned vector of ReturnInst instructions +void LowerExecutionGraph::getFuncRets(Function *func, SmallVector &rets) { + for (auto &block : *func) { + auto blockTerm = block.getTerminator(); + if (blockTerm != nullptr && isa(blockTerm)) + rets.push_back(blockTerm); + } +} + +// ===================================================================================================================== +// Lower the builtin and workgraph global variables +// +// @param enqueueMetaId : Metadata for the workgraph variables +// @param inoutMetaId : Metadata for the built-in variables +void LowerExecutionGraph::lowerGlobals(unsigned enqueueMetaId, unsigned inoutMetaId) { + for (Function *func : m_funcsToLower) { + func->dropAllReferences(); + func->eraseFromParent(); + } + + SmallVector geps; + for (auto globalIt = m_module->global_begin(), end = m_module->global_end(); globalIt != end;) { + GlobalVariable *global = &*globalIt++; + auto meta = global->getMetadata(enqueueMetaId); + if (meta != nullptr) { + global->eraseFromParent(); + } else if ((meta = global->getMetadata(inoutMetaId)) != nullptr) 
{ + processBuiltinGlobals(global, meta); + } + } +} + +// ===================================================================================================================== +// Lower the built-in global variables +// +// @param global : Global variables to lower +// @param metadata : Metadata for the built-in variables +void LowerExecutionGraph::processBuiltinGlobals(GlobalVariable *global, MDNode *metadata) { + auto meta = mdconst::dyn_extract(metadata->getOperand(0)); + unsigned startOperand = 0; + Type *globalTy = global->getValueType(); + if (globalTy->isArrayTy()) { + assert(meta->getNumOperands() == 4); + startOperand += 2; + } + ShaderInOutMetadata inputMeta = {}; + inputMeta.U64All[0] = cast(meta->getOperand(startOperand))->getZExtValue(); + inputMeta.U64All[1] = cast(meta->getOperand(startOperand + 1))->getZExtValue(); + llvm::GlobalVariable *replacement = nullptr; + switch (inputMeta.Value) { + case spv::BuiltInWorkgroupId: + replacement = m_builtInVariables[WorkGraphBuiltIns::WorkgroupId]; + break; + case spv::BuiltInGlobalInvocationId: + replacement = m_builtInVariables[WorkGraphBuiltIns::GlobalInvocationId]; + break; + case spv::BuiltInLocalInvocationId: + case spv::BuiltInLocalInvocationIndex: { + if (!m_threadLaunch) + return; + replacement = inputMeta.Value == spv::BuiltInLocalInvocationId + ? 
m_builtInVariables[WorkGraphBuiltIns::GlobalInvocationId] + : m_localInvocationIndex; + break; + } + case spv::BuiltInShaderIndexAMDX: + replacement = m_builtInVariables[WorkGraphBuiltIns::ShaderIndex]; + break; + case spv::BuiltInRemainingRecursionLevelsAMDX: { + replacement = m_builtInVariables[WorkGraphBuiltIns::RemainingRecursionLevels]; + } break; + default: + // For other builtin Globals, return + return; + } + global->mutateType(replacement->getType()); + replaceGlobal(m_context, global, replacement); +} + +// ===================================================================================================================== +// Fill m_inputPayloadInfo with payload metadata and ShaderEnqueue mode +// +// @param enqueueModes : Workgraph shader enqueue modes +void LowerExecutionGraph::initInputPayloadInfo(const lgc::wg::ShaderEnqueueMode &enqueueModes) { + m_inputPayloadInfo = {"", InvalidValue, 0, 0, false, InvalidValue, InvalidValue, InvalidValue, InvalidValue}; + auto moduleMetadata = m_module->getNamedMetadata(lgc::wg::ShaderEnqueue); + MDNode *payloadMeta = moduleMetadata->getOperand(moduleMetadata->getNumOperands() - 1); + m_inputPayloadInfo.nodeName = cast(payloadMeta->getOperand(0))->getString(); + auto arrayIndexMeta = cast(payloadMeta->getOperand(1)); + m_inputPayloadInfo.arrayIndex = cast(arrayIndexMeta->getValue())->getZExtValue(); + + if (moduleMetadata->getNumOperands() > 1) { + payloadMeta = moduleMetadata->getOperand(0); + auto maxPayloadMeta = cast(payloadMeta->getOperand(0)); + m_inputPayloadInfo.payloadCount = cast(maxPayloadMeta->getValue())->getZExtValue(); + auto payloadSizeMeta = cast(payloadMeta->getOperand(1)); + m_inputPayloadInfo.payloadSize = cast(payloadSizeMeta->getValue())->getZExtValue(); + auto trackFinishWritingMeta = cast(payloadMeta->getOperand(2)); + m_inputPayloadInfo.trackFinishWriting = cast(trackFinishWritingMeta->getValue())->isOne(); + auto dynamicDispatchMeta = cast(payloadMeta->getOperand(3)); + 
m_inputPayloadInfo.dynamicDispatch = cast(dynamicDispatchMeta->getValue())->getZExtValue(); + auto nodeTypeMeta = cast(payloadMeta->getOperand(4)); + m_inputPayloadInfo.nodeType = cast(nodeTypeMeta->getValue())->getZExtValue(); + auto vbTableOffsetMeta = cast(payloadMeta->getOperand(5)); + m_inputPayloadInfo.vbTableOffset = cast(vbTableOffsetMeta->getValue())->getZExtValue(); + auto indexBufferOffsetMeta = cast(payloadMeta->getOperand(6)); + m_inputPayloadInfo.indexBufferOffset = cast(indexBufferOffsetMeta->getValue())->getZExtValue(); + } +} + +// ===================================================================================================================== +// Build the ExecutionGraph PAL metadata +// +// @param enqueueModes : ShaderEnqueueMode mode +// @param payloads : Payload size and count +void LowerExecutionGraph::buildExecGraphNodeMetadata(const ShaderEnqueueMode &enqueueModes, + const InputPayloadInfo &payloads) { + + lgc::GraphNodeMetadata graphNodeMeta = {}; + graphNodeMeta.payloadMaxCount = payloads.payloadCount; + graphNodeMeta.payloadSize = payloads.payloadSize; + graphNodeMeta.maxRecursionDepth = enqueueModes.modes.maxNodeRecursion; + graphNodeMeta.node.name = payloads.nodeName; + graphNodeMeta.node.arrayIndex = + payloads.arrayIndex != InvalidValue ? 
payloads.arrayIndex : enqueueModes.modes.shaderIndex; + graphNodeMeta.inputSharedWith.name = m_inputSharedWithName; + graphNodeMeta.inputSharedWith.arrayIndex = enqueueModes.modes.inputSharedWithArrayIndex; + graphNodeMeta.payloadFlags.crossGroupSharing = payloads.trackFinishWriting; + + if (payloads.dynamicDispatch != InvalidValue) { + graphNodeMeta.dynamicDispatchGrid.componentCount = payloads.dynamicDispatch >> 24; + graphNodeMeta.dynamicDispatchGrid.bitsPerComponent = (payloads.dynamicDispatch >> 16) & 0xff; + graphNodeMeta.dynamicDispatchGrid.offset = payloads.dynamicDispatch & 0xffff; + } else { + graphNodeMeta.dynamicDispatchGrid.componentCount = 3; + graphNodeMeta.dynamicDispatchGrid.bitsPerComponent = (sizeof(unsigned) << 3); + graphNodeMeta.dynamicDispatchGrid.offset = 0; + } + + graphNodeMeta.outputs.resize(m_nodeNamesIdx.size()); + unsigned outIdx = 0; + for (auto &nodeName : m_nodeNamesIdx) { + NodeShaderOutputInfo &outputInfo = graphNodeMeta.outputs[outIdx++]; + + bool recursiveNode = (enqueueModes.modes.maxNodeRecursion > 0) && (nodeName.first == graphNodeMeta.node.name); + outputInfo.node.arrayIndex = recursiveNode ? graphNodeMeta.node.arrayIndex : 0; + outputInfo.arrayCount = recursiveNode ? 1 : UINT_MAX; + // NOTE: It is a workaround of test issue; revisit once the spec has been updated + outputInfo.payloadMaxCount = std::min(nodeName.second.payloadCount, 256u); + outputInfo.payloadSize = nodeName.second.payloadSize; + outputInfo.payloadFlags.crossGroupSharing = nodeName.second.trackFinishWriting; + // Copy name + outputInfo.node.name = nodeName.first.str(); + + bool validPayloadIdToShare = nodeName.second.limitSharedWith != InvalidValue; + outputInfo.budgetSharedWith.enable = validPayloadIdToShare; + outputInfo.budgetSharedWith.index = validPayloadIdToShare ? 
getOutputIndex(nodeName.second.limitSharedWith) : 0; + } + + // Determine the graph node type + // If static dispatch size is provided -> Fixed expansion + // If coalescing mode is provided -> Coalescing + // Otherwise -> Dynamic expansion + if (enqueueModes.modes.staticNumWorkgroupsX != 0) { + assert(enqueueModes.modes.staticNumWorkgroupsX != 0 && enqueueModes.modes.staticNumWorkgroupsY != 0 && + enqueueModes.modes.staticNumWorkgroupsZ != 0); + assert(enqueueModes.modes.maxNumWorkgroupsX == 0 && enqueueModes.modes.maxNumWorkgroupsY == 0 && + enqueueModes.modes.maxNumWorkgroupsZ == 0); + assert(enqueueModes.modes.isCoalescing == false); + graphNodeMeta.nodeType = GraphNodeTypeFixedExpansion; + + graphNodeMeta.dispatchGridX = enqueueModes.modes.staticNumWorkgroupsX; + graphNodeMeta.dispatchGridY = enqueueModes.modes.staticNumWorkgroupsY; + graphNodeMeta.dispatchGridZ = enqueueModes.modes.staticNumWorkgroupsZ; + } else if (enqueueModes.modes.isCoalescing) { + assert(enqueueModes.modes.staticNumWorkgroupsX == 0 && enqueueModes.modes.staticNumWorkgroupsY == 0 && + enqueueModes.modes.staticNumWorkgroupsZ == 0); + assert(enqueueModes.modes.maxNumWorkgroupsX == 0 && enqueueModes.modes.maxNumWorkgroupsY == 0 && + enqueueModes.modes.maxNumWorkgroupsZ == 0); + graphNodeMeta.nodeType = m_threadLaunch ? 
GraphNodeTypeThreadLaunch : GraphNodeTypeCoalescing; + } else { + assert(enqueueModes.modes.staticNumWorkgroupsX == 0 && enqueueModes.modes.staticNumWorkgroupsY == 0 && + enqueueModes.modes.staticNumWorkgroupsZ == 0); + assert(enqueueModes.modes.isCoalescing == false); + graphNodeMeta.nodeType = GraphNodeTypeDynamicExpansion; + graphNodeMeta.dispatchGridX = enqueueModes.modes.maxNumWorkgroupsX; + graphNodeMeta.dispatchGridY = enqueueModes.modes.maxNumWorkgroupsY; + graphNodeMeta.dispatchGridZ = enqueueModes.modes.maxNumWorkgroupsZ; + // Payload not explicitly declared, but it must exist and contain at least the dispatch size + if (graphNodeMeta.payloadSize == 0) { + graphNodeMeta.payloadSize = 12; + graphNodeMeta.payloadMaxCount = 1; + } + + // The shader didn't provide MaxNumWorkgroupsAMDX, fall back to the max limit + if (graphNodeMeta.dispatchGridX == 0) { + graphNodeMeta.dispatchGridX = MaxGridCount; + graphNodeMeta.dispatchGridY = MaxGridCount; + graphNodeMeta.dispatchGridZ = MaxGridCount; + } + } + + // Affects PatchPreparePipelineAbi::setAbiEntryNames() for compute shaders. + m_pipeline->setGraphMetadata(graphNodeMeta); +} + +// ===================================================================================================================== +// Get output node index + +// @param payloadId : Output payload id +unsigned LowerExecutionGraph::getOutputIndex(unsigned id) { + unsigned outIdx = 0; + for (auto &nodeName : m_nodeNamesIdx) { + // The SPIR-V spec expects the decoration to refer to an array type's id. + // String name's id is a fallback for glslang compatibility. 
+ if ((nodeName.second.arrayTypeId == id) || (nodeName.second.payloadId == id)) + return outIdx; + outIdx++; + } + llvm_unreachable("Should find payloadId"); + return outIdx; +} + +// ===================================================================================================================== +// Lower dialect IndexPayloadArrayOp +// +// @param [in] inst : IndexPayloadArrayOp to lower +void LowerExecutionGraph::visitIndexPayloadArray(lgc::wg::IndexPayloadArrayOp &inst) { + m_builder->SetInsertPoint(&inst); + CrossModuleInliner inliner; + Value *payloadAddr = nullptr; + bool isInput = cast(inst.getInput())->isOne(); + if (isInput) { + Value *indexValue = + m_threadLaunch ? m_builder->CreateReadBuiltInInput(lgc::BuiltInLocalInvocationIndex) : inst.getIndex(); + m_builder->CreateStore(indexValue, m_tempVariable); + + Value *args[] = {m_outputAllocateArgs[OutputAllocateArg::ShaderState], m_tempVariable}; + payloadAddr = + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::InputGetPayloadAtIndex], args).returnValue; + } else { + m_builder->CreateStore(inst.getIndex(), m_tempVariable); + auto payloadArray = m_typeLowering->getValue(inst.getPayloadArray())[0]; + Value *args[] = {payloadArray, m_tempVariable}; + payloadAddr = inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::OutputGetPayload], args).returnValue; + } + payloadAddr = m_builder->CreateIntToPtr(payloadAddr, PointerType::get(*m_context, SPIRAS_Global)); + // TODO: currently recursive set GEP chain load/store as volatile to make payload access + // coherent, aka, load glc/dlc. 
+ // correctly represent memory model semantics once backend is ready + std::function setLoadStore = [&](Value *nodearray) { + for (Use &use : nodearray->uses()) { + Instruction *chainedUser = cast(use.getUser()); + if (auto loadInst = dyn_cast(chainedUser)) { + loadInst->setVolatile(true); + } else if (auto storeInst = dyn_cast(chainedUser)) { + storeInst->setVolatile(true); + } else { + auto gepInst = cast(chainedUser); + gepInst->mutateType(nodearray->getType()); + setLoadStore(gepInst); + } + } + }; + + setLoadStore(&inst); + inst.replaceAllUsesWith(payloadAddr); + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// ===================================================================================================================== +// Get input payload + +// @param enqueueMetaId : shader enqueue metadata ID +GlobalVariable *LowerExecutionGraph::getInputPayload(unsigned enqueueMetaId) { + for (auto &global : m_module->globals()) { + if (global.getMetadata(enqueueMetaId)) { + return &global; + } + } + return nullptr; +} + +// ===================================================================================================================== +// Is thread node + +// @param shaderMode : compute shader mode +// @param enqueueModes : enqueue mode +// @param payload : payload +bool LowerExecutionGraph::isThreadLaunchNode(const lgc::ComputeShaderMode &shaderMode, + const ShaderEnqueueMode &enqueueModes, const InputPayloadInfo &payloads) { + + // Workgroup size is 1, 1, 1 + bool threadLaunch = shaderMode.workgroupSizeX == 1; + threadLaunch = threadLaunch && (shaderMode.workgroupSizeY == 1); + threadLaunch = threadLaunch && (shaderMode.workgroupSizeZ == 1); + // Must be coalescing node. 
+ threadLaunch = threadLaunch && enqueueModes.modes.isCoalescing; + // If there is input payload, then input payload count is 1 + threadLaunch = threadLaunch && (payloads.payloadCount <= 1); + + // Less than 8 allocation nodes + threadLaunch = threadLaunch && (m_nodeNamesIdx.size() < 8); + // Compute shader does not use lds + for (auto &global : m_module->globals()) { + if (global.getAddressSpace() == SPIRAS_Local) { + threadLaunch = false; + break; + } + } + return threadLaunch; +} + +// ===================================================================================================================== +// Create Lds memory for the output graph nodes + +// @param outputCount : Number of node output +void LowerExecutionGraph::createGraphLds(unsigned outputCount) { + if (m_graphLds == nullptr) { + // - base_wptr_transfer + // - last_group_transfer + // - allocation_counts[num_outputs] + auto ldsSize = outputCount + 2; + auto ldsTy = ArrayType::get(m_builder->getInt32Ty(), ldsSize); + m_graphLds = new GlobalVariable(*m_module, ldsTy, false, GlobalValue::ExternalLinkage, nullptr, "GraphLds", nullptr, + GlobalValue::NotThreadLocal, SPIRAS_Local); + } +} + +// ===================================================================================================================== +// Create OutputCountOp used for the execution graph library +// +// @param [in] inst : OutputCountOp to lower +void LowerExecutionGraph::visitOutputCount(wg::OutputCountOp &inst) { + m_builder->SetInsertPoint(&inst); + auto outputCount = m_builder->getInt32(m_nodeNamesIdx.size()); + inst.replaceAllUsesWith(outputCount); +} + +// ===================================================================================================================== +// Visit GraphgetLdsOp used for the execution graph library +// +// @param [in] inst : GraphGetLdsOp to lower +void LowerExecutionGraph::visitGraphGetLds(wg::GraphGetLdsOp &inst) { + auto retTy = PointerType::get(m_builder->getInt32Ty(), SPIRAS_Local); + 
m_builder->SetInsertPoint(&inst); + assert(m_graphLds != nullptr); + auto ldsPtr = m_builder->CreateGEP(m_builder->getInt32Ty(), m_graphLds, m_builder->getInt32(0)); + ldsPtr = m_builder->CreateBitCast(ldsPtr, retTy); + inst.replaceAllUsesWith(ldsPtr); +} + +} // namespace Llpc diff --git a/llpc/lowering/LowerExecutionGraph.h b/llpc/lowering/LowerExecutionGraph.h new file mode 100644 index 0000000000..6ba637385b --- /dev/null +++ b/llpc/lowering/LowerExecutionGraph.h @@ -0,0 +1,157 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerExecutionGraph.h + * @brief LLPC header file: contains declaration of Llpc::LowerExecutionGraph + *********************************************************************************************************************** + */ +#pragma once + +#include "Lowering.h" +#include "SPIRVInternal.h" +#include "lgc/LgcWgDialect.h" +#include "llvm-dialects/Dialect/Visitor.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/PassManager.h" + +namespace CompilerUtils { +class TypeLowering; +} // namespace CompilerUtils + +namespace lgc { +class Pipeline; +struct ComputeShaderMode; +} // namespace lgc + +namespace Llpc { + +namespace WorkCreationScope { +enum : unsigned { + Invocation = 0, // WorkCreation library invocation scope + Workgroup = 1, // WorkCreation library workgroup scope + Subgroup = 2 // WorkCreation library subgroup scope +}; +} + +namespace WorkGraphBuiltIns { +enum : unsigned { + CoalescedInputCount = 0, // SPIRV CoalescedInputCount + WorkgroupId, // SPIRV WorkgroupId + GlobalInvocationId, // SPIRV GlobalInvocationId + ShaderIndex, // SPIRV ShaderIndex + RemainingRecursionLevels, // SPIRV RemainingRecursionLevels + LocalInvocationIndex, // SPIRV GlobalInvocationId + Count +}; +} + +namespace OutputAllocateArg { +enum : unsigned { ShaderState = 0, Scope, OutputIdx, ArrayIdx, Count }; +} + +// ===================================================================================================================== +// Represents the pass of SPIR-V lowering shader enqueue opcode +class LowerExecutionGraph : public SpirvLower, public llvm::PassInfoMixin { + + struct OutputPayloadInfo { + unsigned payloadCount; // Payload Count + unsigned payloadSize; // Payload Size + 
unsigned payloadId; // Payload id + unsigned limitSharedWith; // payload id to share with limit + unsigned scope; // created scope + bool trackFinishWriting; // Whether this payload need to track finish writing + unsigned arraySize; // Payload array size + unsigned arrayTypeId; // Payload array type's id + unsigned dynamicDispatch; // DynamicDispatch; + }; + + struct InputPayloadInfo { + llvm::StringRef nodeName; // node name + unsigned arrayIndex; // array Index + unsigned payloadCount; // Payload Count + unsigned payloadSize; // Payload Size + bool trackFinishWriting; // Track finish + unsigned dynamicDispatch; // DynamicDispatch + unsigned nodeType; // Node type + unsigned vbTableOffset; // vertex buffer table offset + unsigned indexBufferOffset; // index buffer table offset + }; + +public: + LowerExecutionGraph(lgc::Pipeline *pipeline); + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + static llvm::StringRef name() { return "Lower SPIR-V execution graph node shader"; } + +private: + void initAllocVariables(lgc::Builder *builder); + typedef void (LowerExecutionGraph::*LibraryFuncPtr)(llvm::Function *, unsigned); + llvm::Type *getShaderStateTy(); + llvm::Type *getOutputRecordsTy(); + void getFuncRets(llvm::Function *func, llvm::SmallVector &rets); + void lowerGlobals(unsigned enqueueMetaId, unsigned inoutMetaId); + void processBuiltinGlobals(llvm::GlobalVariable *global, llvm::MDNode *mdata); + void buildExecGraphNodeMetadata(const lgc::wg::ShaderEnqueueMode &enqueueModes, const InputPayloadInfo &payloads); + void initInputPayloadInfo(const lgc::wg::ShaderEnqueueMode &enqueueModes); + llvm::GlobalVariable *getInputPayload(unsigned enqueueMetaId); + void createGraphLds(unsigned outputCount); + unsigned getOutputIndex(unsigned payloadId); + void visitIndexPayloadArray(lgc::wg::IndexPayloadArrayOp &inst); + void visitAllocateNodePayloads(lgc::wg::AllocateNodePayloadsOp &inst); + void 
visitRegisterOutputNode(lgc::wg::RegisterOutputNodeOp &inst); + void visitEnqueueNodePayloads(lgc::wg::EnqueueNodePayloadsOp &inst); + void visitPayloadArrayLength(lgc::wg::PayloadArrayLengthOp &inst); + void visitIsNodePayloadValid(lgc::wg::IsNodePayloadValidOp &inst); + void visitFinishWritingNodePayload(lgc::wg::FinishWritingNodePayloadOp &inst); + void visitGraphGetLds(lgc::wg::GraphGetLdsOp &inst); + void visitOutputCount(lgc::wg::OutputCountOp &inst); + llvm_dialects::VisitorResult visitLoad(LoadInst &load); + llvm_dialects::VisitorResult visitAlloca(AllocaInst &alloca); + llvm_dialects::VisitorResult visitStore(StoreInst &store); + llvm_dialects::VisitorResult visitGetElementPtr(GetElementPtrInst &gep); + Type *replacePayloadType(Type *ty); + bool isThreadLaunchNode(const lgc::ComputeShaderMode &shaderMode, const lgc::wg::ShaderEnqueueMode &enqueueModes, + const InputPayloadInfo &payloads); + std::array m_outputAllocateArgs; + llvm::Value *m_tempVariable; + llvm::GlobalVariable *m_localInvocationIndex; // Built-in variable + llvm::GlobalVariable *m_builtInVariables[WorkGraphBuiltIns::Count]; // Built-in variable + llvm::SmallSet m_funcsToLower; // Function to lower + llvm::MapVector m_nodeNamesIdx; // Node names + llvm::DenseMap m_workGraphLibFuncNames; // Workgraph library functions names + llvm::SmallVector m_graphLibFuncs; // Workgraph library + llvm::Type *m_payloadArrayPtrType = nullptr; + CompilerUtils::TypeLowering *m_typeLowering = nullptr; + lgc::wg::ShaderEnqueueMode m_enqueueModes; + std::string m_inputSharedWithName; + + unsigned m_metaEnqueueId; // Shader enqueue meta id + lgc::Pipeline *m_pipeline; // Pipeline State + InputPayloadInfo m_inputPayloadInfo; // Input payload info + llvm::GlobalVariable *m_graphLds; // Graph Lds variable + bool m_threadLaunch; // Enable ThreadLaunch mode or not +}; +} // namespace Llpc diff --git a/llpc/lower/LowerGLCompatibility.cpp b/llpc/lowering/LowerGlCompatibility.cpp similarity index 95% rename from 
llpc/lower/LowerGLCompatibility.cpp rename to llpc/lowering/LowerGlCompatibility.cpp index a10f5bcd36..179812a755 100644 --- a/llpc/lower/LowerGLCompatibility.cpp +++ b/llpc/lowering/LowerGlCompatibility.cpp @@ -24,20 +24,20 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file LowerGLCompatibility.cpp - * @brief LLPC source file: contains implementation of class Llpc::LowerGLCompatibility. + * @file LowerGlCompatibility.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerGlCompatibility. *********************************************************************************************************************** */ -#include "LowerGLCompatibility.h" +#include "LowerGlCompatibility.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcGraphicsContext.h" -#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" #include "lgc/Pipeline.h" #include "llvm/IR/DerivedTypes.h" -#define DEBUG_TYPE "llpc-spirv-lower-gl-compatibility" +#define DEBUG_TYPE "lower-gl-compatibility" using namespace llvm; using namespace Llpc; @@ -45,7 +45,7 @@ using namespace Llpc; namespace Llpc { // ===================================================================================================================== -LowerGLCompatibility::LowerGLCompatibility() +LowerGlCompatibility::LowerGlCompatibility() : m_retInst(nullptr), m_entryPointEnd(nullptr), m_originalEntryBlock(nullptr), m_out(nullptr), m_clipVertex(nullptr), m_clipDistance(nullptr), m_clipPlane(nullptr), m_frontColor(nullptr), m_backColor(nullptr), m_frontSecondaryColor(nullptr), m_backSecondaryColor(nullptr), m_color(nullptr), m_secondaryColor(nullptr), @@ -57,9 +57,9 @@ LowerGLCompatibility::LowerGLCompatibility() // // @param [in/out] module : LLVM module to be run on // 
@param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses LowerGLCompatibility::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerGlCompatibility::run(Module &module, ModuleAnalysisManager &analysisManager) { SpirvLower::init(&module); - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-gl-compatibility\n"); + LLVM_DEBUG(dbgs() << "Run the pass Lower-gl-compatibility\n"); if (!needRun()) return PreservedAnalyses::all(); @@ -113,7 +113,7 @@ PreservedAnalyses LowerGLCompatibility::run(Module &module, ModuleAnalysisManage // ===================================================================================================================== // Use to check whether need run the pass. -bool LowerGLCompatibility::needRun() { +bool LowerGlCompatibility::needRun() { bool result = false; if (m_context->getPipelineType() == PipelineType::Graphics) { auto moduleData = @@ -146,7 +146,7 @@ bool LowerGLCompatibility::needRun() { // Get location in meta data, if the global variable is UniformConstant. // // @param [in] var : Global variable to get uniform constant location -unsigned LowerGLCompatibility::getUniformLocation(llvm::GlobalVariable *var) { +unsigned LowerGlCompatibility::getUniformLocation(llvm::GlobalVariable *var) { assert(var->getType()->getAddressSpace() == SPIRAS_Uniform && var->hasMetadata(gSPIRVMD::UniformConstant)); MDNode *metaNode = var->getMetadata(gSPIRVMD::UniformConstant); return mdconst::extract(metaNode->getOperand(3))->getZExtValue(); @@ -159,7 +159,7 @@ unsigned LowerGLCompatibility::getUniformLocation(llvm::GlobalVariable *var) { // @param [in] mds : The metadata constant of InOut Global variable to be decode. // @param [in] index : The the index of the metadata in the embellish type. // @param [out] out : Use to output the element's metadatas of the InOut Global variable. 
-void LowerGLCompatibility::decodeInOutMetaRecursivelyByIndex(llvm::Type *valueTy, llvm::Constant *mds, +void LowerGlCompatibility::decodeInOutMetaRecursivelyByIndex(llvm::Type *valueTy, llvm::Constant *mds, ArrayRef index, llvm::SmallVector &out) { auto currentType = valueTy; @@ -204,7 +204,7 @@ void LowerGLCompatibility::decodeInOutMetaRecursivelyByIndex(llvm::Type *valueTy // @param [in] valueTy : The metadata's embellish type. // @param [in] mds : The metadata constant of InOut Global variable to be decode. // @param [out] out : Use to output the element's metadatas of the InOut Global variable. -void LowerGLCompatibility::decodeInOutMetaRecursively(llvm::Type *valueTy, llvm::Constant *mds, +void LowerGlCompatibility::decodeInOutMetaRecursively(llvm::Type *valueTy, llvm::Constant *mds, llvm::SmallVector &out) { ShaderInOutMetadata md = {}; if (valueTy->isSingleValueType()) { @@ -236,7 +236,7 @@ void LowerGLCompatibility::decodeInOutMetaRecursively(llvm::Type *valueTy, llvm: // Collect "Return" instructions and replace those instructions with a branch instruction point to "ReturnBlock". // // @param [in] func : The entry function of the shader module. -void LowerGLCompatibility::unifyFunctionReturn(Function *func) { +void LowerGlCompatibility::unifyFunctionReturn(Function *func) { SmallVector retInsts; for (BasicBlock &block : *func) { Instruction *terminator = block.getTerminator(); @@ -263,7 +263,7 @@ void LowerGLCompatibility::unifyFunctionReturn(Function *func) { // ===================================================================================================================== // Collect "EmitCall" instructions in the shader module. 
-void LowerGLCompatibility::collectEmitInst() { +void LowerGlCompatibility::collectEmitInst() { for (Function &function : m_module->functions()) { auto mangledName = function.getName(); // We get all users before iterating because the iterator can be invalidated @@ -281,7 +281,7 @@ void LowerGLCompatibility::collectEmitInst() { // ===================================================================================================================== // Build resource may used in compatibility emulation. -void LowerGLCompatibility::collectEmulationResource() { +void LowerGlCompatibility::collectEmulationResource() { // Collect emulation information. for (auto &global : m_module->globals()) { if (global.getType()->getAddressSpace() == SPIRAS_Uniform && global.hasMetadata(gSPIRVMD::UniformConstant)) { @@ -455,7 +455,7 @@ void LowerGLCompatibility::collectEmulationResource() { // ===================================================================================================================== // Acquire the patch pointer for do lower, function unifyFunctionReturn may cause IR change. -void LowerGLCompatibility::buildPatchPositionInfo() { +void LowerGlCompatibility::buildPatchPositionInfo() { if (m_shaderStage == ShaderStageGeometry) collectEmitInst(); else @@ -476,44 +476,44 @@ void LowerGLCompatibility::buildPatchPositionInfo() { // ===================================================================================================================== // Check whether need do lower for ClipVertex. -bool LowerGLCompatibility::needLowerClipVertex() { +bool LowerGlCompatibility::needLowerClipVertex() { return (m_clipVertex != nullptr && !m_clipVertex->user_empty()); } // ===================================================================================================================== // Check whether need do lower for FrontColor. 
-bool LowerGLCompatibility::needLowerFrontColor() { +bool LowerGlCompatibility::needLowerFrontColor() { return (m_frontColor != nullptr && !m_frontColor->user_empty()); } // ===================================================================================================================== // Check whether need do lower for BackColor. -bool LowerGLCompatibility::needLowerBackColor() { +bool LowerGlCompatibility::needLowerBackColor() { return (m_backColor != nullptr && !m_backColor->user_empty()); } // ===================================================================================================================== // Check whether need do lower for FrontSecondaryColor. -bool LowerGLCompatibility::needLowerFrontSecondaryColor() { +bool LowerGlCompatibility::needLowerFrontSecondaryColor() { return (m_frontSecondaryColor != nullptr && !m_frontSecondaryColor->user_empty()); } // ===================================================================================================================== // Check whether need do lower for BackSecondaryColor. -bool LowerGLCompatibility::needLowerBackSecondaryColor() { +bool LowerGlCompatibility::needLowerBackSecondaryColor() { return (m_backSecondaryColor != nullptr && !m_backSecondaryColor->user_empty()); } // ===================================================================================================================== // Check whether need do emulate for draw pixels. -bool LowerGLCompatibility::needEmulateDrawPixels() { +bool LowerGlCompatibility::needEmulateDrawPixels() { auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); return (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.drawPixelsType != Vkgc::DrawPixelsTypeNone); } // ===================================================================================================================== // Check whether need do emulate for two-side lighting. 
-bool LowerGLCompatibility::needEmulateTwoSideLighting() { +bool LowerGlCompatibility::needEmulateTwoSideLighting() { auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); return (m_shaderStage == ShaderStageFragment) && buildInfo->glState.enableTwoSideLighting && (m_color != nullptr || m_secondaryColor != nullptr); @@ -521,14 +521,14 @@ bool LowerGLCompatibility::needEmulateTwoSideLighting() { // ===================================================================================================================== // Check whether need do emulate for bitmap. -bool LowerGLCompatibility::needEmulateBitmap() { +bool LowerGlCompatibility::needEmulateBitmap() { auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); return (m_shaderStage == ShaderStageFragment) && buildInfo->glState.enableBitmap; } // ===================================================================================================================== // Check whether need do emulate point/line smooth and line/polygon stipple. 
-bool LowerGLCompatibility::needEmulateSmoothStipple() { +bool LowerGlCompatibility::needEmulateSmoothStipple() { auto options = m_context->getPipelineContext()->getPipelineOptions(); return (m_shaderStage == ShaderStageFragment) && (options->getGlState().enablePolygonStipple || options->getGlState().enableLineSmooth || @@ -537,14 +537,14 @@ bool LowerGLCompatibility::needEmulateSmoothStipple() { // ===================================================================================================================== // Check whether need do clamp fs -bool LowerGLCompatibility::needLowerFragColor() { +bool LowerGlCompatibility::needLowerFragColor() { auto buildInfo = static_cast(m_context->getPipelineBuildInfo()); return m_fragColor && (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.enableColorClampFs); } // ===================================================================================================================== // Check whether need do alphaTest. -bool LowerGLCompatibility::needLowerAlphaTest() { +bool LowerGlCompatibility::needLowerAlphaTest() { auto buildInfo = static_cast(m_context->getPipelineBuildInfo()); return (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.alphaTestFunc != Vkgc::AlphaTestFunc::Always); } @@ -553,7 +553,7 @@ bool LowerGLCompatibility::needLowerAlphaTest() { // Create InOut global variable Metadata. // // @param [in] md : The base information of the in/out meta date. -MDTuple *LowerGLCompatibility::createInOutMd(const ShaderInOutMetadata &md) { +MDTuple *LowerGlCompatibility::createInOutMd(const ShaderInOutMetadata &md) { auto int64Type = m_builder->getInt64Ty(); // Built metadata for the array element std::vector mdValues; @@ -575,7 +575,7 @@ MDTuple *LowerGLCompatibility::createInOutMd(const ShaderInOutMetadata &md) { // Create builtin InOut global variable Metadata. // // @param [in] builtIn : The built-in kind of the in/out meta date. 
-MDTuple *LowerGLCompatibility::createBuiltInInOutMd(lgc::BuiltInKind builtIn) { +MDTuple *LowerGlCompatibility::createBuiltInInOutMd(lgc::BuiltInKind builtIn) { ShaderInOutMetadata inOutMd = {}; inOutMd.IsBuiltIn = true; inOutMd.Value = builtIn; @@ -584,7 +584,7 @@ MDTuple *LowerGLCompatibility::createBuiltInInOutMd(lgc::BuiltInKind builtIn) { // ===================================================================================================================== // Create the SPIR-V output builtin variable "gl_ClipDistance". -void LowerGLCompatibility::createClipDistance() { +void LowerGlCompatibility::createClipDistance() { assert(m_clipDistance == nullptr); auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); uint32_t indexOfLastClipPlane = 0; @@ -638,7 +638,7 @@ void LowerGLCompatibility::createClipDistance() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_ClipPlane". -void LowerGLCompatibility::createClipPlane() { +void LowerGlCompatibility::createClipPlane() { auto floatType = m_builder->getFloatTy(); auto vec4Type = FixedVectorType::get(floatType, 4); auto clipPlaneType = ArrayType::get(vec4Type, 8); @@ -667,7 +667,7 @@ void LowerGLCompatibility::createClipPlane() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_BackColor". 
-void LowerGLCompatibility::createBackColor() { +void LowerGlCompatibility::createBackColor() { auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); auto backColor = new GlobalVariable(*m_module, vec4Type, false, GlobalValue::ExternalLinkage, nullptr, "gl_BackColor", nullptr, GlobalVariable::GeneralDynamicTLSModel, SPIRV::SPIRAS_Input); @@ -682,7 +682,7 @@ void LowerGLCompatibility::createBackColor() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_BackSecondaryColor". -void LowerGLCompatibility::createBackSecondaryColor() { +void LowerGlCompatibility::createBackSecondaryColor() { auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); auto backSecondaryColor = new GlobalVariable(*m_module, vec4Type, false, GlobalValue::ExternalLinkage, nullptr, "gl_BackSecondaryColor", @@ -698,7 +698,7 @@ void LowerGLCompatibility::createBackSecondaryColor() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_FrontFacing". -void LowerGLCompatibility::createFrontFacing() { +void LowerGlCompatibility::createFrontFacing() { assert(m_frontFacing == nullptr); auto frontFacing = new GlobalVariable(*m_module, m_builder->getInt1Ty(), false, GlobalValue::ExternalLinkage, nullptr, @@ -709,7 +709,7 @@ void LowerGLCompatibility::createFrontFacing() { // ===================================================================================================================== // Create the ARB builtin variable "patchTexCoord". 
-void LowerGLCompatibility::createPatchTexCoord() { +void LowerGlCompatibility::createPatchTexCoord() { auto vec2Type = FixedVectorType::get(m_builder->getFloatTy(), 2); auto patchTexCoord = new GlobalVariable(*m_module, vec2Type, false, GlobalValue::ExternalLinkage, nullptr, "patchTexCoord", nullptr, @@ -725,7 +725,7 @@ void LowerGLCompatibility::createPatchTexCoord() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_FragDepth". -void LowerGLCompatibility::createFragDepth() { +void LowerGlCompatibility::createFragDepth() { assert(m_fragDepth == nullptr); auto fragDepth = new GlobalVariable(*m_module, m_builder->getFloatTy(), false, GlobalValue::ExternalLinkage, nullptr, @@ -736,7 +736,7 @@ void LowerGLCompatibility::createFragDepth() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_fragStencilRef". -void LowerGLCompatibility::createFragStencilRef() { +void LowerGlCompatibility::createFragStencilRef() { assert(m_fragStencilRef == nullptr); auto fragStencilRef = new GlobalVariable(*m_module, m_builder->getInt32Ty(), false, GlobalValue::ExternalLinkage, nullptr, @@ -747,7 +747,7 @@ void LowerGLCompatibility::createFragStencilRef() { // ===================================================================================================================== // Inline the emulation instruction of clip vertex. -void LowerGLCompatibility::emulateStoreClipVertex() { +void LowerGlCompatibility::emulateStoreClipVertex() { auto floatType = m_builder->getFloatTy(); Type *vec4Type = VectorType::get(floatType, 4, false); // Load clipVertex @@ -776,7 +776,7 @@ void LowerGLCompatibility::emulateStoreClipVertex() { // Inline the emulation instruction of front/back/front secondary/back secondary color. 
// // @param [in] color : One of front/back/front secondary/back secondary color. -void LowerGLCompatibility::emulationOutputColor(llvm::User *color) { +void LowerGlCompatibility::emulationOutputColor(llvm::User *color) { auto floatType = m_builder->getFloatTy(); Type *vec4Type = VectorType::get(floatType, 4, false); // Load frontColor @@ -793,7 +793,7 @@ void LowerGLCompatibility::emulationOutputColor(llvm::User *color) { // ===================================================================================================================== // Emulate for draw pixels emulation. -void LowerGLCompatibility::emulateDrawPixels() { +void LowerGlCompatibility::emulateDrawPixels() { m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); auto floatType = m_builder->getFloatTy(); @@ -854,7 +854,7 @@ void LowerGLCompatibility::emulateDrawPixels() { // ===================================================================================================================== // Emulate for two-side lighting. -void LowerGLCompatibility::emulateTwoSideLighting() { +void LowerGlCompatibility::emulateTwoSideLighting() { auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); if (m_shaderStage == ShaderStageFragment) { m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); @@ -886,7 +886,7 @@ void LowerGLCompatibility::emulateTwoSideLighting() { // ===================================================================================================================== // Emulate for bitmap emulation. 
-void LowerGLCompatibility::emulateBitmap() { +void LowerGlCompatibility::emulateBitmap() { auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); auto floatType = m_builder->getFloatTy(); @@ -924,7 +924,7 @@ void LowerGLCompatibility::emulateBitmap() { // @param [in] valTy : current input value's type, should be global's valueType in top-level. // @param [in] metaVal : metadata value of current output variable. // @param [in] alphaScaleVal : calculated alpha scaling results, default value is one. -void LowerGLCompatibility::patchAlphaScaling(Value *val, Type *valTy, Constant *metaVal, Value *alphaScaleVal) { +void LowerGlCompatibility::patchAlphaScaling(Value *val, Type *valTy, Constant *metaVal, Value *alphaScaleVal) { ShaderInOutMetadata outputMeta = {}; if (valTy->isArrayTy()) { @@ -965,7 +965,7 @@ void LowerGLCompatibility::patchAlphaScaling(Value *val, Type *valTy, Constant * // ===================================================================================================================== // Emulate for point/line smooth and line/polygon stipple. -void LowerGLCompatibility::emulateSmoothStipple() { +void LowerGlCompatibility::emulateSmoothStipple() { auto options = m_context->getPipelineContext()->getPipelineOptions(); auto pipelineBuildInfo = static_cast(m_context->getPipelineBuildInfo()); bool needYInvert = pipelineBuildInfo->getGlState().originUpperLeft; @@ -1134,7 +1134,7 @@ void LowerGLCompatibility::emulateSmoothStipple() { // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_ClipVertex". -void LowerGLCompatibility::lowerClipVertex() { +void LowerGlCompatibility::lowerClipVertex() { if (m_clipPlane == nullptr) createClipPlane(); if (m_clipDistance == nullptr) @@ -1158,7 +1158,7 @@ void LowerGLCompatibility::lowerClipVertex() { // "gl_BackSecondaryColor". 
// // @param [in] color : One of gl_FrontColor/gl_BackColor/gl_FrontSecondaryColor/gl_BackSecondaryColor. -void LowerGLCompatibility::lowerColor(llvm::User *color) { +void LowerGlCompatibility::lowerColor(llvm::User *color) { if (m_shaderStage == ShaderStageVertex || m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageTessEval || m_shaderStage == ShaderStageFragment) { assert(m_retInst != nullptr); @@ -1174,37 +1174,37 @@ void LowerGLCompatibility::lowerColor(llvm::User *color) { // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_FrontColor". -void LowerGLCompatibility::lowerFrontColor() { +void LowerGlCompatibility::lowerFrontColor() { lowerColor(m_frontColor); } // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_BackColor". -void LowerGLCompatibility::lowerBackColor() { +void LowerGlCompatibility::lowerBackColor() { lowerColor(m_backColor); } // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_FrontSecondaryColor". -void LowerGLCompatibility::lowerFrontSecondaryColor() { +void LowerGlCompatibility::lowerFrontSecondaryColor() { lowerColor(m_frontSecondaryColor); } // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_BackSecondaryColor". 
-void LowerGLCompatibility::lowerBackSecondaryColor() { +void LowerGlCompatibility::lowerBackSecondaryColor() { lowerColor(m_backSecondaryColor); } // ===================================================================================================================== // Does clamp fragment color -void LowerGLCompatibility::lowerFragColor() { +void LowerGlCompatibility::lowerFragColor() { lowerColor(m_fragColor); } // ===================================================================================================================== // Does lowering operations for alpha test. -void LowerGLCompatibility::lowerAlphaTest() { +void LowerGlCompatibility::lowerAlphaTest() { GlobalVariable *outputLocationZero = nullptr; auto floatTy = m_builder->getFloatTy(); Type *vec4Type = VectorType::get(floatTy, 4, false); @@ -1227,7 +1227,7 @@ void LowerGLCompatibility::lowerAlphaTest() { if (outputLocationZero != nullptr && outputLocationZero->getValueType()->isVectorTy()) { auto type = cast(outputLocationZero->getValueType()); uint32_t vectorNum = type->getNumElements(); - if (vectorNum != 4) + if (vectorNum != 4 || !type->getElementType()->isFloatTy()) return; } else return; diff --git a/llpc/lower/LowerGLCompatibility.h b/llpc/lowering/LowerGlCompatibility.h similarity index 95% rename from llpc/lower/LowerGLCompatibility.h rename to llpc/lowering/LowerGlCompatibility.h index a23c612c8e..6bfd8e2513 100644 --- a/llpc/lower/LowerGLCompatibility.h +++ b/llpc/lowering/LowerGlCompatibility.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,14 +24,14 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file LowerGLCompatibility.h - * @brief LLPC header file: contains declaration of Llpc::LowerGLCompatibility + * @file LowerGlCompatibility.h + * @brief LLPC header file: contains declaration of Llpc::LowerGlCompatibility *********************************************************************************************************************** */ #pragma once +#include "Lowering.h" #include "SPIRVInternal.h" -#include "llpcSpirvLower.h" #include "lgc/Builder.h" #include "lgc/LgcDialect.h" #include "llvm/IR/PassManager.h" @@ -41,9 +41,9 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering ray query post inline. 
-class LowerGLCompatibility : public SpirvLower, public llvm::PassInfoMixin { +class LowerGlCompatibility : public SpirvLower, public llvm::PassInfoMixin { public: - LowerGLCompatibility(); + LowerGlCompatibility(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower GLSL compatibility variables and operations"; } diff --git a/llpc/lower/LowerGlobals.cpp b/llpc/lowering/LowerGlobals.cpp similarity index 96% rename from llpc/lower/LowerGlobals.cpp rename to llpc/lowering/LowerGlobals.cpp index 22cc6c806a..475fae5df8 100644 --- a/llpc/lower/LowerGlobals.cpp +++ b/llpc/lowering/LowerGlobals.cpp @@ -29,12 +29,12 @@ *********************************************************************************************************************** */ #include "LowerGlobals.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcDebug.h" #include "llpcGraphicsContext.h" #include "llpcRayTracingContext.h" -#include "llpcSpirvLowerUtil.h" #include "compilerutils/CompilerUtils.h" #include "compilerutils/TypesMetadata.h" #include "lgc/LgcDialect.h" @@ -1963,6 +1963,79 @@ void LowerGlobals::lowerUniformConstants() { globalUsers[inst->getFunction()].push_back(inst); } + // Replace uniform constant variable with a compile time constants if it is set from driver side. 
+ Vkgc::CompileConstInfo *compileTimeConstsInfo = + m_context->getPipelineContext()->getPipelineOptions()->compileConstInfo; + + if (compileTimeConstsInfo && compileTimeConstsInfo->numCompileTimeConstants > 0) { + GlobalVariable *compileTimeConstVal = nullptr; + bool foundGlobal = false; + MDNode *metaNode = global.getMetadata(gSPIRVMD::UniformConstant); + auto uniformConstantsSet = mdconst::extract(metaNode->getOperand(0))->getZExtValue(); + auto uniformConstantsBinding = mdconst::extract(metaNode->getOperand(1))->getZExtValue(); + auto uniformConstantsOffset = mdconst::dyn_extract(metaNode->getOperand(2))->getZExtValue(); + for (uint32_t i = 0; i < compileTimeConstsInfo->numCompileTimeConstants; i++) { + auto specializeUniformInfo = compileTimeConstsInfo->pCompileTimeConstants[i]; + if (specializeUniformInfo.offset == uniformConstantsOffset && + specializeUniformInfo.set == uniformConstantsSet && + specializeUniformInfo.binding == uniformConstantsBinding) { + // determine result constant type. + foundGlobal = true; + uint32_t uniformChannelCount = 1; + uint32_t uniformChannelBytesCount = 1; + Type *uniformTy = global.getValueType(); + assert(!uniformTy->isStructTy()); + Type *constTy = uniformTy; + + if (auto *vectorUniformTy = dyn_cast(uniformTy)) { + constTy = vectorUniformTy->getElementType(); + uniformChannelCount = vectorUniformTy->getElementCount().getFixedValue(); + } + uniformChannelBytesCount = constTy->getScalarSizeInBits() / (sizeof(uint8_t) * 8); + + if (uniformChannelBytesCount * uniformChannelCount != specializeUniformInfo.validBytes) { + // Don't support partial replacement now (like vector component partial replacement). 
+ continue; + } + + // Construct constants + Constant *constData[16] = {}; + compileTimeConstVal = new GlobalVariable(uniformTy, true, global.getLinkage(), nullptr, "", + GlobalValue::NotThreadLocal, SPIRAS_Private); + for (uint32_t i = 0; i < uniformChannelCount; i++) { + if (uniformTy->isFloatingPointTy()) { + double data = 0.0; + memcpy(&data, specializeUniformInfo.values.u8 + i * uniformChannelBytesCount, uniformChannelBytesCount); + constData[i] = ConstantFP::get(constTy, data); + } else { + uint64_t data = 0; + memcpy(&data, specializeUniformInfo.values.u8 + i * uniformChannelBytesCount, uniformChannelBytesCount); + constData[i] = ConstantInt::get(constTy, data); + } + } + + // Replace current uniform with known compile time constants. + Constant *initializer = uniformChannelCount > 1 ? ConstantVector::get(constData) : constData[0]; + compileTimeConstVal->setInitializer(initializer); + for (auto &eachFunc : globalUsers) { + for (auto *inst : eachFunc.second) { + inst->replaceUsesOfWith(&global, compileTimeConstVal); + } + } + + // Insert new global to list and remove replaced global variable. + global.getParent()->insertGlobalVariable(compileTimeConstVal); + mapGlobalVariableToProxy(compileTimeConstVal); + globalsToRemove.push_back(&global); + break; + } + } + // If replacement happens, skip following buffer load convert. 
+ if (foundGlobal) { + continue; + } + } + for (auto &eachFunc : globalUsers) { MDNode *metaNode = global.getMetadata(gSPIRVMD::UniformConstant); auto uniformConstantsSet = mdconst::extract(metaNode->getOperand(0))->getZExtValue(); diff --git a/llpc/lower/LowerGlobals.h b/llpc/lowering/LowerGlobals.h similarity index 99% rename from llpc/lower/LowerGlobals.h rename to llpc/lowering/LowerGlobals.h index 86ef63bc06..f4fc2c5b3d 100644 --- a/llpc/lower/LowerGlobals.h +++ b/llpc/lowering/LowerGlobals.h @@ -30,8 +30,8 @@ */ #pragma once +#include "Lowering.h" #include "SPIRVInternal.h" -#include "llpcSpirvLower.h" #include "vkgcDefs.h" #include "lgc/Builder.h" #include "llvm/IR/IRBuilder.h" diff --git a/llpc/lowering/LowerGraphLibrary.cpp b/llpc/lowering/LowerGraphLibrary.cpp new file mode 100644 index 0000000000..aa0672051c --- /dev/null +++ b/llpc/lowering/LowerGraphLibrary.cpp @@ -0,0 +1,268 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerGraphLibrary.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerGraphLibrary. + *********************************************************************************************************************** + */ + +#include "LowerGraphLibrary.h" +#include "LowerInternalLibraryIntrinsic.h" +#include "SPIRVInternal.h" +#include "lgc/Builder.h" +#include "lgc/BuiltIns.h" +#include "lgc/LgcWgDialect.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +#define DEBUG_TYPE "lower-graph-library" + +using namespace llvm; +using namespace Llpc; +using namespace lgc; +extern const char *WorkGraphNames[]; +constexpr unsigned WorkGraphFuncCount = 16; + +namespace AmdExtFunc { +enum : unsigned { + BackingStore = 0, // Backing store + ShaderDirectory, // Shader Directory + NodeDispatchInfo1, // Node Dispatch Info1 + NodeDispatchInfo2, // Node Dispatch Info2 + TraceBuffer, // Trace Buffer + LdsLoadDword, // Lds load dword + LdsStoreDword, // Lds store dword + LdsAtomicAddDword, // Lds atomic add + OutputCount, // Lds output count + Count +}; +} + +static const char *AmdExtNames[] = { + "AmdWorkGraphsBackingStore", "AmdWorkGraphsShaderDirectory", "AmdWorkGraphsNodeDispatchInfo1", + "AmdWorkGraphsNodeDispatchInfo2", "AmdWorkGraphsTraceBuffer", "AmdWorkGraphsLdsLoadDword", + "AmdWorkGraphsLdsStoreDword", "AmdWorkGraphsLdsAtomicAddDword", "AmdWorkGraphsOutputCount"}; + +// 
===================================================================================================================== +LowerGraphLibrary::LowerGraphLibrary() { + for (unsigned i = 0; i < AmdExtFunc::Count; ++i) { + m_extFuncNames[AmdExtNames[i]] = i; + } + for (unsigned i = 0; i < WorkGraphFuncCount; ++i) + m_workgraphNames.insert(WorkGraphNames[i]); +} + +// ===================================================================================================================== +// Executes this LLVM patching pass on the specified LLVM module. +// +// @param [in/out] module : LLVM module to be run on +// @param [in/out] analysisManager : Analysis manager to use for this transformation +PreservedAnalyses LowerGraphLibrary::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-execution-graph\n"); + SpirvLower::init(&module); + for (auto funcIt = module.begin(), funcEnd = module.end(); funcIt != funcEnd;) { + Function *func = &*funcIt++; + processLibraryFunction(func); + } + return PreservedAnalyses::none(); +} + +// ===================================================================================================================== +// Clear the block before patching the function +// +// @param func : The function to clear +BasicBlock *LowerGraphLibrary::clearBlock(Function *func) { + assert(func->size() == 1); + BasicBlock &entryBlock = func->getEntryBlock(); + for (auto instIt = entryBlock.begin(); instIt != entryBlock.end();) { + auto &inst = *instIt++; + inst.eraseFromParent(); + } + return &entryBlock; +} + +// ===================================================================================================================== +// Clear the block before patching the function +// +// @param func : The function to process +void LowerGraphLibrary::processLibraryFunction(Function *&func) { + LibraryFuncPtr amdLibraryFuncs[] = { + &LowerGraphLibrary::createBackingStore, &LowerGraphLibrary::createShaderDirectory, 
+ &LowerGraphLibrary::createNodeDispatchInfo1, &LowerGraphLibrary::createNodeDispatchInfo2, + &LowerGraphLibrary::createTraceBuffer, &LowerGraphLibrary::createLdsLoadDword, + &LowerGraphLibrary::createLdsStoreDword, &LowerGraphLibrary::createLdsAtomicAddDword, + &LowerGraphLibrary::createOutputCount}; + + if (m_workgraphNames.find(func->getName()) != m_workgraphNames.end()) { + func->setLinkage(GlobalValue::WeakAnyLinkage); + return; + } + auto funcIt = m_extFuncNames.find(func->getName()); + + if (funcIt != m_extFuncNames.end()) { + auto funcIdx = funcIt->second; + (this->*amdLibraryFuncs[funcIdx])(func, funcIdx); + return; + } + + auto &commonFuncTable = InternalLibraryIntrinsicUtil::LibraryFunctionTable::get().m_libFuncPtrs; + auto commonFuncIt = commonFuncTable.find(func->getName()); + if (commonFuncIt != commonFuncTable.end()) { + auto funcPtr = commonFuncIt->second; + m_builder->SetInsertPoint(clearBlock(func)); + (*funcPtr)(func, m_builder); + } +} + +// ===================================================================================================================== +// Create Backing store +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createBackingStore(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::BackingStore); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInGraphControlStruct)); +} + +// ===================================================================================================================== +// Create Shader Directory +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createShaderDirectory(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::ShaderDirectory); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInShaderDirectory)); +} + +// 
===================================================================================================================== +// Create Node Dispatch Info1 +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createNodeDispatchInfo1(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::NodeDispatchInfo1); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInNodeDispatchInfo1)); +} + +// ===================================================================================================================== +// Create Node Dispatch Info2 +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createNodeDispatchInfo2(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::NodeDispatchInfo2); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInNodeDispatchInfo2)); +} + +// ===================================================================================================================== +// Create Trace Buffer +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createTraceBuffer(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::TraceBuffer); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInWorkGraphTraceBuf)); +} + +// ===================================================================================================================== +// Create Load DWORD from lds +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createLdsLoadDword(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::LdsLoadDword); + // AmdWorkGraphsLdsLoadDword(uint offset) in byte + m_builder->SetInsertPoint(clearBlock(func)); + Value *offset = 
func->getArg(0); + offset = m_builder->CreateLoad(m_builder->getInt32Ty(), offset); + // convert offset from BYTE to DWORD + offset = m_builder->CreateLShr(offset, 2); + auto graphLds = m_builder->create(); + auto ldsPtr = m_builder->CreateGEP(m_builder->getInt32Ty(), graphLds, {offset}); + // Load value from lds position + Value *ldsValue = m_builder->CreateLoad(m_builder->getInt32Ty(), ldsPtr); + m_builder->CreateRet(ldsValue); +} + +// ===================================================================================================================== +// Create store DWORD to lds +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createLdsStoreDword(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::LdsStoreDword); + // void AmdWorkGraphsLdsStoreDword(uint offset, uint value) + m_builder->SetInsertPoint(clearBlock(func)); + Value *offset = func->getArg(0); + offset = m_builder->CreateLoad(m_builder->getInt32Ty(), offset); + // convert offset from BYTE to DWORD + offset = m_builder->CreateLShr(offset, 2); + Value *value = func->getArg(1); + value = m_builder->CreateLoad(m_builder->getInt32Ty(), value); + auto graphLds = m_builder->create(); + auto ldsPtr = m_builder->CreateGEP(m_builder->getInt32Ty(), graphLds, {offset}); + m_builder->CreateStore(value, ldsPtr); + m_builder->CreateRetVoid(); +} + +// ===================================================================================================================== +// Create atomic add DWORD to lds +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createLdsAtomicAddDword(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::LdsAtomicAddDword); + // AmdWorkGraphsLdsAtomicAddDword(uint offset, uint value) + m_builder->SetInsertPoint(clearBlock(func)); + Value *offset = func->getArg(0); + offset = m_builder->CreateLoad(m_builder->getInt32Ty(), offset); + // convert offset 
from BYTE to DWORD + offset = m_builder->CreateLShr(offset, 2); + Value *value = func->getArg(1); + value = m_builder->CreateLoad(m_builder->getInt32Ty(), value); + auto graphLds = m_builder->create(); + auto ldsPtr = m_builder->CreateGEP(m_builder->getInt32Ty(), graphLds, {offset}); + m_builder->CreateAtomicRMW(AtomicRMWInst::Add, ldsPtr, value, MaybeAlign(), AtomicOrdering::Monotonic, + SyncScope::System); + m_builder->CreateRetVoid(); +} + +// ===================================================================================================================== +// Create output count +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createOutputCount(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::OutputCount); + // uint AmdWorkgraphsOutputCount() + m_builder->SetInsertPoint(clearBlock(func)); + auto outputCount = m_builder->create(); + m_builder->CreateRet(outputCount); +} diff --git a/llpc/lowering/LowerGraphLibrary.h b/llpc/lowering/LowerGraphLibrary.h new file mode 100644 index 0000000000..73bcba1f5d --- /dev/null +++ b/llpc/lowering/LowerGraphLibrary.h @@ -0,0 +1,68 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerGraphLibrary.h + * @brief LLPC header file: contains declaration of Llpc::LowerGraphLibrary + *********************************************************************************************************************** + */ +#pragma once + +#include "Lowering.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class BasicBlock; +} // namespace llvm + +namespace Llpc { + +// ===================================================================================================================== +// Represents the pass of SPIR-V lowering graph library +class LowerGraphLibrary : public SpirvLower, public llvm::PassInfoMixin { +public: + LowerGraphLibrary(); + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + static llvm::StringRef name() { return "Lower SPIR-V library shader"; } + +private: + typedef void (LowerGraphLibrary::*LibraryFuncPtr)(llvm::Function *, unsigned); + void processLibraryFunction(llvm::Function *&func); + llvm::BasicBlock *clearBlock(llvm::Function *func); + void createBackingStore(llvm::Function *func, unsigned); + void createShaderDirectory(llvm::Function *func, unsigned); + void 
createNodeDispatchInfo1(llvm::Function *func, unsigned); + void createNodeDispatchInfo2(llvm::Function *func, unsigned); + void createTraceBuffer(llvm::Function *func, unsigned); + void createLdsLoadDword(llvm::Function *func, unsigned); + void createLdsStoreDword(llvm::Function *func, unsigned); + void createLdsAtomicAddDword(llvm::Function *func, unsigned); + void createOutputCount(llvm::Function *func, unsigned); + llvm::DenseSet m_workgraphNames; // External linked workgraph functions + llvm::DenseMap m_extFuncNames; // Library functions to patch +}; +}; // namespace Llpc diff --git a/llpc/lower/LowerInstMetaRemove.cpp b/llpc/lowering/LowerInstMetaRemove.cpp similarity index 87% rename from llpc/lower/LowerInstMetaRemove.cpp rename to llpc/lowering/LowerInstMetaRemove.cpp index 2a826d8cad..5967d25f3f 100644 --- a/llpc/lower/LowerInstMetaRemove.cpp +++ b/llpc/lowering/LowerInstMetaRemove.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file LowerInstMetaRemove.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerInstMetaRemove. + * @brief LLPC source file: contains implementation of class Llpc::LowerInstMetaRemove. 
*********************************************************************************************************************** */ #include "LowerInstMetaRemove.h" @@ -34,7 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#define DEBUG_TYPE "llpc-spirv-lower-inst-meta-remove" +#define DEBUG_TYPE "lower-inst-meta-remove" using namespace llvm; using namespace SPIRV; @@ -43,7 +43,7 @@ using namespace Llpc; namespace Llpc { // ===================================================================================================================== -SpirvLowerInstMetaRemove::SpirvLowerInstMetaRemove() : m_changed(false) { +LowerInstMetaRemove::LowerInstMetaRemove() { } // ===================================================================================================================== @@ -51,11 +51,11 @@ SpirvLowerInstMetaRemove::SpirvLowerInstMetaRemove() : m_changed(false) { // // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerInstMetaRemove::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Inst-Meta-Remove\n"); +PreservedAnalyses LowerInstMetaRemove::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Inst-Meta-Remove\n"); SpirvLower::init(&module); - m_changed = false; + bool changed = false; // Remove calls to functions whose names start with "spirv.NonUniform". SmallVector callsToRemove; @@ -72,7 +72,7 @@ PreservedAnalyses SpirvLowerInstMetaRemove::run(Module &module, ModuleAnalysisMa for (auto *callInst : callsToRemove) { callInst->dropAllReferences(); callInst->eraseFromParent(); - m_changed = true; + changed = true; } // Remove any named metadata in the module that starts "spirv.". 
@@ -83,10 +83,10 @@ PreservedAnalyses SpirvLowerInstMetaRemove::run(Module &module, ModuleAnalysisMa } for (NamedMDNode *namedMdNode : nodesToRemove) { namedMdNode->eraseFromParent(); - m_changed = true; + changed = true; } - return m_changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + return changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } } // namespace Llpc diff --git a/llpc/lower/LowerInstMetaRemove.h b/llpc/lowering/LowerInstMetaRemove.h similarity index 90% rename from llpc/lower/LowerInstMetaRemove.h rename to llpc/lowering/LowerInstMetaRemove.h index 7a5343abe9..f8f5a85b88 100644 --- a/llpc/lower/LowerInstMetaRemove.h +++ b/llpc/lowering/LowerInstMetaRemove.h @@ -25,28 +25,25 @@ /** *********************************************************************************************************************** * @file LowerInstMetaRemove.h - * @brief LLPC header file: contains declaration of class Llpc::SpirvLowerInstMetaRemove. + * @brief LLPC header file: contains declaration of class Llpc::LowerInstMetaRemove. *********************************************************************************************************************** */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/PassManager.h" namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering operations for removing the instruction metadata. 
-class SpirvLowerInstMetaRemove : public SpirvLower, public llvm::PassInfoMixin { +class LowerInstMetaRemove : public SpirvLower, public llvm::PassInfoMixin { public: - SpirvLowerInstMetaRemove(); + LowerInstMetaRemove(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower SPIR-V instruction metadata by removing those targeted"; } - -private: - bool m_changed; // Whether the module is changed }; } // namespace Llpc diff --git a/llpc/lower/LowerInternalLibraryIntrinsic.cpp b/llpc/lowering/LowerInternalLibraryIntrinsic.cpp similarity index 94% rename from llpc/lower/LowerInternalLibraryIntrinsic.cpp rename to llpc/lowering/LowerInternalLibraryIntrinsic.cpp index 2b184a0006..8090df71d6 100644 --- a/llpc/lower/LowerInternalLibraryIntrinsic.cpp +++ b/llpc/lowering/LowerInternalLibraryIntrinsic.cpp @@ -68,6 +68,26 @@ static void createHalt(Function *func, Builder *builder) { builder->CreateRetVoid(); } +// ===================================================================================================================== +// Create device scope memory_order_acquire +// +// @param func : The function to process +// @param builder : The IR builder +static void createDeviceMemoryAcquire(Function *func, Builder *builder) { + builder->CreateFence(AtomicOrdering::Acquire, builder->getContext().getOrInsertSyncScopeID("agent")); + builder->CreateRetVoid(); +} + +// ===================================================================================================================== +// Create device scope memory_order_release +// +// @param func : The function to process +// @param builder : The IR builder +static void createDeviceMemoryRelease(Function *func, Builder *builder) { + builder->CreateFence(AtomicOrdering::Release, builder->getContext().getOrInsertSyncScopeID("agent")); + builder->CreateRetVoid(); +} + // 
===================================================================================================================== // Create function to compute the number of waves in the workgroup // @@ -372,6 +392,8 @@ InternalLibraryIntrinsicUtil::LibraryFunctionTable::LibraryFunctionTable() { m_libFuncPtrs["AmdExtLaneIndex"] = &createLaneIndex; m_libFuncPtrs["AmdExtLaneCount"] = &createLaneCount; m_libFuncPtrs["AmdExtHalt"] = &createHalt; + m_libFuncPtrs["AmdExtDeviceMemoryAcquire"] = &createDeviceMemoryAcquire; + m_libFuncPtrs["AmdExtDeviceMemoryRelease"] = &createDeviceMemoryRelease; m_libFuncPtrs["AmdExtNumWavesCompute"] = &createNumWavesCompute; m_libFuncPtrs["AmdExtWaveIndexCompute"] = &createWaveIndexCompute; m_libFuncPtrs["AmdExtGroupIdCompute"] = &createGroupIdCompute; diff --git a/llpc/lower/LowerInternalLibraryIntrinsic.h b/llpc/lowering/LowerInternalLibraryIntrinsic.h similarity index 100% rename from llpc/lower/LowerInternalLibraryIntrinsic.h rename to llpc/lowering/LowerInternalLibraryIntrinsic.h diff --git a/llpc/lower/LowerMath.cpp b/llpc/lowering/LowerMath.cpp similarity index 89% rename from llpc/lower/LowerMath.cpp rename to llpc/lowering/LowerMath.cpp index f5280171a4..a097d0dbc9 100644 --- a/llpc/lower/LowerMath.cpp +++ b/llpc/lowering/LowerMath.cpp @@ -25,15 +25,15 @@ /** *********************************************************************************************************************** * @file LowerMath.cpp - * @brief LLPC source file: implementations of Llpc::SpirvLowerMathConstFolding and Llpc::SpirvLowerMathFloatOp. + * @brief LLPC source file: implementations of Llpc::LowerMathConstFolding and Llpc::LowerMathFloatOp. 
*********************************************************************************************************************** */ #include "LowerMath.h" +#include "Lowering.h" #include "SPIRVInternal.h" #include "hex_float.h" #include "llpcContext.h" #include "llpcGraphicsContext.h" -#include "llpcSpirvLower.h" #include "lgc/Builder.h" #include "lgc/Pipeline.h" #include "llvm/Analysis/ConstantFolding.h" @@ -46,9 +46,9 @@ #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" -#define DEBUG_TYPE_CONST_FOLDING "llpc-spirv-lower-math-const-folding" -#define DEBUG_TYPE_PRECISION "llpc-spirv-lower-math-precision" -#define DEBUG_TYPE_FLOAT_OP "llpc-spirv-lower-math-float-op" +#define DEBUG_TYPE_CONST_FOLDING "lower-math-const-folding" +#define DEBUG_TYPE_PRECISION "lower-math-precision" +#define DEBUG_TYPE_FLOAT_OP "lower-math-float-op" using namespace lgc; using namespace llvm; @@ -63,9 +63,12 @@ static cl::opt BackwardPropagateNoContract("backward-propagate-no-contract", cl::desc("Backward propagate NoContraction decorations to input operations"), cl::init(false)); +static cl::opt DisableGlPositionOpt("disable-gl-position-opt", + cl::desc("Disable all use of fast math flags on gl_Position"), + cl::init(false)); // ===================================================================================================================== -SpirvLowerMath::SpirvLowerMath() +LowerMath::LowerMath() : m_changed(false), m_fp16DenormFlush(false), m_fp32DenormFlush(false), m_fp64DenormFlush(false), m_fp16RoundToZero(false) { } @@ -88,7 +91,7 @@ static void setFpMathAttribute(Function &func, bool fp32, FpDenormMode denormMod // Initialise transform class. 
// // @param [in/out] module : LLVM module to be run on -void SpirvLowerMath::init(Module &module) { +void LowerMath::init(Module &module) { SpirvLower::init(&module); m_changed = false; @@ -115,7 +118,7 @@ void SpirvLowerMath::init(Module &module) { // Checks desired denormal flush behavior and inserts llvm.canonicalize. // // @param inst : Instruction to flush denormals if needed -void SpirvLowerMath::flushDenormIfNeeded(Instruction *inst) { +void LowerMath::flushDenormIfNeeded(Instruction *inst) { auto destTy = inst->getType(); if ((destTy->getScalarType()->isHalfTy() && m_fp16DenormFlush) || (destTy->getScalarType()->isFloatTy() && m_fp32DenormFlush) || @@ -149,7 +152,8 @@ static bool isNoContract(Value *value) { // Disable fast math for all values related with the specified value // // @param value : Value to disable fast math for -static void disableFastMath(Value *value) { +// @param clearAll : Whether to clear all flags, including nnan and nsz +static void disableFastMath(Value *value, bool clearAll) { std::set allValues; std::list workSet; if (isa(value)) { @@ -160,10 +164,15 @@ static void disableFastMath(Value *value) { auto it = workSet.begin(); while (!workSet.empty()) { if (isa(*it)) { - // Reset fast math flags to default + // Reset fast math flags to default, but maintain nsz and nnan as required. 
auto inst = cast(*it); - FastMathFlags fastMathFlags; - inst->copyFastMathFlags(fastMathFlags); + FastMathFlags newFmf; + if (!clearAll) { + FastMathFlags instFmf = inst->getFastMathFlags(); + newFmf.setNoSignedZeros(instFmf.noSignedZeros()); + newFmf.setNoNaNs(instFmf.noNaNs()); + } + inst->copyFastMathFlags(newFmf); } for (Value *operand : (*it)->operands()) { @@ -188,10 +197,10 @@ static void disableFastMath(Value *value) { // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerMathConstFolding::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Math-Const-Folding\n"); +PreservedAnalyses LowerMathConstFolding::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Math-Const-Folding\n"); - SpirvLowerMath::init(module); + LowerMath::init(module); if (m_shaderStage == ShaderStageInvalid) return PreservedAnalyses::all(); @@ -247,14 +256,14 @@ PreservedAnalyses SpirvLowerMathConstFolding::run(Module &module, ModuleAnalysis // ===================================================================================================================== // Return the module entry point function. -Function *SpirvLowerMathConstFolding::getEntryPoint() { +Function *LowerMathConstFolding::getEntryPoint() { return m_entryPoint; } #undef DEBUG_TYPE // DEBUG_TYPE_CONST_FOLDING #define DEBUG_TYPE DEBUG_TYPE_PRECISION -bool SpirvLowerMathPrecision::adjustExports(Module &module) { +bool LowerMathPrecision::adjustExports(Module &module, bool disablePositionOpt) { bool changed = false; for (auto &func : module.functions()) { // Disable fast math for gl_Position. 
@@ -282,7 +291,7 @@ bool SpirvLowerMathPrecision::adjustExports(Module &module) { } if (valueWritten && builtIn == lgc::BuiltInPosition) { - disableFastMath(valueWritten); + disableFastMath(valueWritten, disablePositionOpt); changed = true; } } @@ -301,7 +310,7 @@ static bool clearContractFlag(Instruction *inst) { return true; } -bool SpirvLowerMathPrecision::propagateNoContract(Module &module, bool forward, bool backward) { +bool LowerMathPrecision::propagateNoContract(Module &module, bool forward, bool backward) { bool changed = false; SmallVector roots; @@ -373,8 +382,8 @@ bool SpirvLowerMathPrecision::propagateNoContract(Module &module, bool forward, // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Math-Precision\n"); +PreservedAnalyses LowerMathPrecision::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Math-Precision\n"); SpirvLower::init(&module); if (m_shaderStage == ShaderStageInvalid) @@ -382,18 +391,21 @@ PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisMan bool forwardPropagate = false; bool backwardPropagate = false; + bool disableGlPositionOpt = false; auto pipelineContext = m_context->getPipelineContext(); switch (pipelineContext->getPipelineType()) { case PipelineType::Graphics: { auto shaderInfo = (static_cast(pipelineContext))->getPipelineShaderInfo(m_shaderStage); forwardPropagate = forwardPropagate || shaderInfo->options.forwardPropagateNoContract; backwardPropagate = backwardPropagate || shaderInfo->options.backwardPropagateNoContract; + disableGlPositionOpt = shaderInfo->options.disableGlPositionOpt; break; } case PipelineType::Compute: { auto shaderInfo = 
&(static_cast(pipelineContext->getPipelineBuildInfo()))->cs; forwardPropagate = forwardPropagate || shaderInfo->options.forwardPropagateNoContract; backwardPropagate = backwardPropagate || shaderInfo->options.backwardPropagateNoContract; + disableGlPositionOpt = shaderInfo->options.disableGlPositionOpt; break; } case PipelineType::RayTracing: { @@ -405,6 +417,7 @@ PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisMan continue; forwardPropagate = forwardPropagate || pipelineInfo->pShaders[i].options.forwardPropagateNoContract; backwardPropagate = backwardPropagate || pipelineInfo->pShaders[i].options.backwardPropagateNoContract; + disableGlPositionOpt = pipelineInfo->pShaders[i].options.disableGlPositionOpt; } break; } @@ -416,10 +429,12 @@ PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisMan forwardPropagate = ForwardPropagateNoContract; if (BackwardPropagateNoContract.getNumOccurrences()) backwardPropagate = BackwardPropagateNoContract; + if (DisableGlPositionOpt.getNumOccurrences()) + disableGlPositionOpt = DisableGlPositionOpt; bool adjustedExports = false; if (pipelineContext->getPipelineOptions()->enableImplicitInvariantExports) - adjustedExports = adjustExports(module); + adjustedExports = adjustExports(module, disableGlPositionOpt); bool propagatedNoContract = false; if (forwardPropagate || backwardPropagate) @@ -436,10 +451,10 @@ PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisMan // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerMathFloatOp::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Math-Float-Op\n"); +PreservedAnalyses LowerMathFloatOp::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Math-Float-Op\n"); - 
SpirvLowerMath::init(module); + LowerMath::init(module); visit(m_module); return m_changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); @@ -449,7 +464,7 @@ PreservedAnalyses SpirvLowerMathFloatOp::run(Module &module, ModuleAnalysisManag // Visits binary operator instruction. // // @param binaryOp : Binary operator instruction -void SpirvLowerMathFloatOp::visitBinaryOperator(BinaryOperator &binaryOp) { +void LowerMathFloatOp::visitBinaryOperator(BinaryOperator &binaryOp) { Instruction::BinaryOps opCode = binaryOp.getOpcode(); auto src1 = binaryOp.getOperand(0); @@ -536,7 +551,7 @@ void SpirvLowerMathFloatOp::visitBinaryOperator(BinaryOperator &binaryOp) { // Visits call instruction. // // @param callInst : Call instruction -void SpirvLowerMathFloatOp::visitCallInst(CallInst &callInst) { +void LowerMathFloatOp::visitCallInst(CallInst &callInst) { auto callee = callInst.getCalledFunction(); if (!callee) return; @@ -551,7 +566,7 @@ void SpirvLowerMathFloatOp::visitCallInst(CallInst &callInst) { // Visits fptrunc instruction. // // @param fptruncInst : Fptrunc instruction -void SpirvLowerMathFloatOp::visitFPTruncInst(FPTruncInst &fptruncInst) { +void LowerMathFloatOp::visitFPTruncInst(FPTruncInst &fptruncInst) { if (m_fp16RoundToZero) { auto src = fptruncInst.getOperand(0); auto srcTy = src->getType(); diff --git a/llpc/lower/LowerMath.h b/llpc/lowering/LowerMath.h similarity index 88% rename from llpc/lower/LowerMath.h rename to llpc/lowering/LowerMath.h index d4c30d8f4c..2c2e702f63 100644 --- a/llpc/lower/LowerMath.h +++ b/llpc/lowering/LowerMath.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/PassManager.h" @@ -39,9 +39,9 @@ namespace Llpc { // ===================================================================================================================== // SPIR-V lowering operations for math transformation. 
-class SpirvLowerMath : public SpirvLower { +class LowerMath : public SpirvLower { public: - SpirvLowerMath(); + LowerMath(); protected: void init(llvm::Module &module); @@ -56,7 +56,7 @@ class SpirvLowerMath : public SpirvLower { // ===================================================================================================================== // SPIR-V lowering operations for math constant folding. -class SpirvLowerMathConstFolding : public SpirvLowerMath, public llvm::PassInfoMixin { +class LowerMathConstFolding : public LowerMath, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); @@ -70,22 +70,22 @@ class SpirvLowerMathConstFolding : public SpirvLowerMath, public llvm::PassInfoM // ===================================================================================================================== // SPIR-V lowering operations to adjust fast math flags. -class SpirvLowerMathPrecision : public SpirvLower, public llvm::PassInfoMixin { +class LowerMathPrecision : public SpirvLower, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower SPIR-V for precision (fast math flags)"; } - bool adjustExports(llvm::Module &module); + bool adjustExports(llvm::Module &module, bool clearAll); bool propagateNoContract(llvm::Module &module, bool forward, bool backward); }; // ===================================================================================================================== // SPIR-V lowering operations for math floating point optimisation. 
-class SpirvLowerMathFloatOp : public SpirvLowerMath, - public llvm::PassInfoMixin, - public llvm::InstVisitor { +class LowerMathFloatOp : public LowerMath, + public llvm::PassInfoMixin, + public llvm::InstVisitor { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/llpc/lower/LowerMemoryOp.cpp b/llpc/lowering/LowerMemoryOp.cpp similarity index 99% rename from llpc/lower/LowerMemoryOp.cpp rename to llpc/lowering/LowerMemoryOp.cpp index 7b98ea7f02..2a4e3b2799 100644 --- a/llpc/lower/LowerMemoryOp.cpp +++ b/llpc/lowering/LowerMemoryOp.cpp @@ -208,6 +208,8 @@ bool LowerMemoryOp::needExpandDynamicIndex(GetElementPtrInst *getElemPtr, unsign // NOTE: Normal SPIR-V translation won't generate this, it may come from our internally inserted // instructions to do pointer increment. allowExpand = false; + } else if (indexedTy->isFloatTy()) { + allowExpand = false; } else { llvm_unreachable("Should never be called!"); allowExpand = false; diff --git a/llpc/lower/LowerMemoryOp.h b/llpc/lowering/LowerMemoryOp.h similarity index 99% rename from llpc/lower/LowerMemoryOp.h rename to llpc/lowering/LowerMemoryOp.h index 842f4c1d8b..004f6ebbd3 100644 --- a/llpc/lower/LowerMemoryOp.h +++ b/llpc/lowering/LowerMemoryOp.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/PassManager.h" #include diff --git a/llpc/lower/LowerPostInline.cpp b/llpc/lowering/LowerPostInline.cpp similarity index 99% rename from llpc/lower/LowerPostInline.cpp rename to llpc/lowering/LowerPostInline.cpp index 64400f8193..1646860944 100644 --- a/llpc/lower/LowerPostInline.cpp +++ b/llpc/lowering/LowerPostInline.cpp @@ -29,9 +29,9 @@ *********************************************************************************************************************** */ #include "LowerPostInline.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" 
-#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" #include "lgc/Pipeline.h" #include "llvm/IR/DerivedTypes.h" diff --git a/llpc/lower/LowerPostInline.h b/llpc/lowering/LowerPostInline.h similarity index 98% rename from llpc/lower/LowerPostInline.h rename to llpc/lowering/LowerPostInline.h index b3937f2017..1224cab3e6 100644 --- a/llpc/lower/LowerPostInline.h +++ b/llpc/lowering/LowerPostInline.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/PassManager.h" namespace Llpc { diff --git a/llpc/lower/LowerRayTracing.cpp b/llpc/lowering/LowerRayTracing.cpp similarity index 99% rename from llpc/lower/LowerRayTracing.cpp rename to llpc/lowering/LowerRayTracing.cpp index a9d0800e36..e4fc9a50b9 100644 --- a/llpc/lower/LowerRayTracing.cpp +++ b/llpc/lowering/LowerRayTracing.cpp @@ -30,11 +30,11 @@ */ #include "LowerRayTracing.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "gpurt-compiler.h" #include "llpcContext.h" #include "llpcRayTracingContext.h" -#include "llpcSpirvLowerUtil.h" #include "compilerutils/CompilerUtils.h" #include "llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/GpurtContext.h" diff --git a/llpc/lower/LowerRayTracing.h b/llpc/lowering/LowerRayTracing.h similarity index 99% rename from llpc/lower/LowerRayTracing.h rename to llpc/lowering/LowerRayTracing.h index 8eeee82d1b..5a93734527 100644 --- a/llpc/lower/LowerRayTracing.h +++ b/llpc/lowering/LowerRayTracing.h @@ -30,8 +30,8 @@ */ #pragma once +#include "Lowering.h" #include "SPIRVInternal.h" -#include "llpcSpirvLower.h" #include "compilerutils/CompilerUtils.h" #include "llvm/ADT/SmallSet.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lower/LowerTerminator.cpp b/llpc/lowering/LowerTerminator.cpp similarity index 93% rename from llpc/lower/LowerTerminator.cpp rename to llpc/lowering/LowerTerminator.cpp index 6544d37896..8e88d9c758 100644 --- a/llpc/lower/LowerTerminator.cpp +++ 
b/llpc/lowering/LowerTerminator.cpp @@ -25,24 +25,24 @@ /** *********************************************************************************************************************** * @file LowerTerminator.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerTerminator. + * @brief LLPC source file: contains implementation of class Llpc::LowerTerminator. * @details This pass removes trailing instructions after known terminators. * These dead instructions can occur when functions calling terminators, such as OpKill, are inlined. *********************************************************************************************************************** */ #include "LowerTerminator.h" +#include "Lowering.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcDebug.h" -#include "llpcSpirvLower.h" -#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#define DEBUG_TYPE "llpc-spirv-lower-terminator" +#define DEBUG_TYPE "lower-terminator" using namespace llvm; using namespace SPIRV; @@ -55,8 +55,8 @@ namespace Llpc { // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerTerminator::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Terminator\n"); +PreservedAnalyses LowerTerminator::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Terminator\n"); SpirvLower::init(&module); @@ -87,7 +87,7 @@ PreservedAnalyses SpirvLowerTerminator::run(Module &module, ModuleAnalysisManage // If found, mark dead instructions for removal and add a return immediately following the kill. 
// // @param callInst : "Call" instruction -void SpirvLowerTerminator::visitCallInst(CallInst &callInst) { +void LowerTerminator::visitCallInst(CallInst &callInst) { auto callee = callInst.getCalledFunction(); if (!callee) return; diff --git a/llpc/lower/LowerTerminator.h b/llpc/lowering/LowerTerminator.h similarity index 88% rename from llpc/lower/LowerTerminator.h rename to llpc/lowering/LowerTerminator.h index bb472d9768..58e548c94c 100644 --- a/llpc/lower/LowerTerminator.h +++ b/llpc/lowering/LowerTerminator.h @@ -25,12 +25,12 @@ /** *********************************************************************************************************************** * @file LowerTerminator.h - * @brief LLPC header file: contains declaration of Llpc::SpirvLowerTerminator + * @brief LLPC header file: contains declaration of Llpc::LowerTerminator *********************************************************************************************************************** */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/PassManager.h" @@ -38,9 +38,9 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering terminators. 
-class SpirvLowerTerminator : public SpirvLower, - public llvm::PassInfoMixin, - public llvm::InstVisitor { +class LowerTerminator : public SpirvLower, + public llvm::PassInfoMixin, + public llvm::InstVisitor { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/llpc/lower/LowerTranslator.cpp b/llpc/lowering/LowerTranslator.cpp similarity index 92% rename from llpc/lower/LowerTranslator.cpp rename to llpc/lowering/LowerTranslator.cpp index 0c86246823..657b630156 100644 --- a/llpc/lower/LowerTranslator.cpp +++ b/llpc/lowering/LowerTranslator.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file LowerTranslator.cpp - * @brief LLPC source file: contains implementation of Llpc::SpirvLowerTranslator + * @brief LLPC source file: contains implementation of Llpc::LowerTranslator *********************************************************************************************************************** */ #include "LowerTranslator.h" @@ -36,7 +36,7 @@ #include #include -#define DEBUG_TYPE "llpc-spirv-lower-translator" +#define DEBUG_TYPE "lower-translator" using namespace llvm; using namespace Llpc; @@ -46,8 +46,8 @@ using namespace Llpc; // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -llvm::PreservedAnalyses SpirvLowerTranslator::run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Translator\n"); +llvm::PreservedAnalyses LowerTranslator::run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Translator\n"); SpirvLower::init(&module); @@ -67,7 +67,7 @@ llvm::PreservedAnalyses SpirvLowerTranslator::run(llvm::Module &module, llvm::Mo // // @param shaderInfo : Specialization info 
// @param [in/out] module : Module to translate into, initially empty -void SpirvLowerTranslator::translateSpirvToLlvm(const PipelineShaderInfo *shaderInfo, Module *module) { +void LowerTranslator::translateSpirvToLlvm(const PipelineShaderInfo *shaderInfo, Module *module) { BinaryData optimizedSpirvBin = {}; const ShaderModuleData *moduleData = reinterpret_cast(shaderInfo->pModuleData); assert(moduleData->binType == BinaryType::Spirv); diff --git a/llpc/lower/LowerTranslator.h b/llpc/lowering/LowerTranslator.h similarity index 87% rename from llpc/lower/LowerTranslator.h rename to llpc/lowering/LowerTranslator.h index 612e150376..86416d21aa 100644 --- a/llpc/lower/LowerTranslator.h +++ b/llpc/lowering/LowerTranslator.h @@ -25,26 +25,26 @@ /** *********************************************************************************************************************** * @file LowerTranslator.h - * @brief LLPC header file: contains declaration of Llpc::SpirvLowerTranslator + * @brief LLPC header file: contains declaration of Llpc::LowerTranslator *********************************************************************************************************************** */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/PassManager.h" namespace Llpc { // ===================================================================================================================== // Pass to translate the SPIR-V modules and generate an IR module for the whole pipeline -class SpirvLowerTranslator : public SpirvLower, public llvm::PassInfoMixin { +class LowerTranslator : public SpirvLower, public llvm::PassInfoMixin { public: - SpirvLowerTranslator() {} + LowerTranslator() {} // // @param stage : Shader stage // @param shaderInfo : Shader info for this shader - SpirvLowerTranslator(ShaderStage stage, const PipelineShaderInfo *shaderInfo, llvm::StringRef globalVarPrefix = {}) + LowerTranslator(ShaderStage stage, const PipelineShaderInfo *shaderInfo, 
llvm::StringRef globalVarPrefix = {}) : m_shaderInfo(shaderInfo), m_globalVarPrefix(globalVarPrefix) {} llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); @@ -57,7 +57,7 @@ class SpirvLowerTranslator : public SpirvLower, public llvm::PassInfoMixin> argPromotionsFuncs; - auto rtipVersion = m_context->getPipelineContext()->getRayTracingState()->rtIpVersion; - unsigned rtip = rtipVersion.major * 10 + rtipVersion.minor; SmallVector maybeRtFuncs; for (Function &func : module) { if (func.isDeclaration() || !func.hasName()) @@ -83,11 +81,11 @@ PreservedAnalyses ProcessGpuRtLibrary::run(Module &module, ModuleAnalysisManager StringRef funcName = func.getName(); SmallBitVector argPromotions(/*size=*/8); bool isRqFunc = false; - if (funcName.starts_with("TraceRayInline")) + if (funcName.starts_with("_RayQuery_TraceRayInline")) argPromotions.set(1, 8); - else if (funcName.starts_with("RayQueryProceed")) + else if (funcName.starts_with("_RayQuery_Proceed")) argPromotions.set(1, 3); - else if (funcName.starts_with("FetchTrianglePositionFromRayQuery")) + else if (funcName.starts_with("_RayQuery_FetchTrianglePosition")) argPromotions.set(1); else { StringRef rqFuncName = funcName; @@ -102,40 +100,6 @@ PreservedAnalyses ProcessGpuRtLibrary::run(Module &module, ModuleAnalysisManager maybeRtFuncs.push_back(&func); continue; } - - // This is a rayQuery function, and we have the args requiring promotion in the argPromotions bit vector. - // Parse off the RTIP suffix if any, e.g. "2_0", into a two-digit decimal number, e.g. 20. - // Ignore BVH8 funcs. 
- if (funcName.ends_with("BVH8")) - continue; - StringRef funcSuffix = funcName.take_back(3); - unsigned funcRtip = 0; - if (funcSuffix.size() == 3 && isdigit(funcSuffix[0]) && funcSuffix[1] == '_' && isdigit(funcSuffix[2])) { - funcRtip = (funcSuffix[0] - '0') * 10 + (funcSuffix[2] - '0'); - funcName = funcName.drop_back(funcSuffix.size()); - } - // If this function has an RTIP suffix but it is wrong, ignore it (leaving it as internal linkage so it gets - // removed later). - if (funcRtip != 0 && funcRtip != rtip) - continue; - - if (funcRtip != 0) { - // We have a function with the correct RTIP suffix. We want to rename it without the RTIP suffix. - // If there is another function of the same name without the RTIP suffix, take its name and make the - // other function internal so it gets removed later. (This works whether we saw that function first or - // this RTIP-suffixed one.) - if (Function *otherFunc = module.getFunction(funcName)) { - otherFunc->setLinkage(GlobalValue::InternalLinkage); - func.takeName(otherFunc); - } else { - // No other function. Set name the normal way. Note use of str() to copy the unsuffixed name out - // before setName() frees it. - func.setName(funcName.str()); - } - } - // Set external linkage on this function. - func.setLinkage(GlobalValue::WeakAnyLinkage); - if (argPromotions.any()) { // Add this function to the list that need arg promotion. // We don't do the arg promotion here as it invalidates the module iterator. @@ -151,8 +115,7 @@ PreservedAnalyses ProcessGpuRtLibrary::run(Module &module, ModuleAnalysisManager Function *func = argPromotionsFunc.first; if (func->getLinkage() == GlobalValue::InternalLinkage) continue; - Function *promotedFunc = CompilerUtils::promotePointerArguments(func, argPromotionsFunc.second); - promotedFunc->setLinkage(GlobalValue::WeakAnyLinkage); + CompilerUtils::promotePointerArguments(func, argPromotionsFunc.second); } // Process ray-tracing (i.e. 
non-rayQuery) functions in a separate loop; processLibraryFunction() may do diff --git a/llpc/lower/ProcessGpuRtLibrary.h b/llpc/lowering/ProcessGpuRtLibrary.h similarity index 99% rename from llpc/lower/ProcessGpuRtLibrary.h rename to llpc/lowering/ProcessGpuRtLibrary.h index 30fe5a5ca5..6f4ceb1aa6 100644 --- a/llpc/lower/ProcessGpuRtLibrary.h +++ b/llpc/lowering/ProcessGpuRtLibrary.h @@ -29,8 +29,8 @@ *********************************************************************************************************************** */ #pragma once +#include "Lowering.h" #include "SPIRVInternal.h" -#include "llpcSpirvLower.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lowering/ScalarReplacementOfBuiltins.cpp b/llpc/lowering/ScalarReplacementOfBuiltins.cpp new file mode 100644 index 0000000000..deb8b0ab0f --- /dev/null +++ b/llpc/lowering/ScalarReplacementOfBuiltins.cpp @@ -0,0 +1,442 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file ScalarReplacementOfBuiltins.cpp + * @brief LLPC source file: split and replace global variables that are structures containing built-in values + *********************************************************************************************************************** + */ +#include "ScalarReplacementOfBuiltins.h" +#include "SPIRVInternal.h" +#include "llpcContext.h" +#include "vkgcDefs.h" +#include "spirv/spirv.hpp" +#include "lgc/Builder.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include + +#define DEBUG_TYPE "scalar-replacement-of-builtins" + +using namespace llvm; +using namespace lgc; +using namespace Llpc; + +namespace Llpc { + +// ===================================================================================================================== +// Executes this SPIR-V lowering pass on the specified LLVM module. 
+// +// @param [in/out] module : LLVM module to be run on +// @param [in/out] analysisManager : Analysis manager to use for this transformation +PreservedAnalyses ScalarReplacementOfBuiltins::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG( + dbgs() << "Run the pass refactor and replace global variables that are structures containing built-in values\n"); + + bool changed = false; + SpirvLower::init(&module); + SmallVector originalGlobals(make_pointer_range(m_module->globals())); + for (auto &global : originalGlobals) { + if (!needsSplit(global)) + continue; + + // TODO: Handle the case where globalBuiltinVar is gl_in or gl_MeshVerticesEXT. + if (global->getValueType()->isStructTy()) { + splitBuiltinStructure(global); + changed = true; + } else if (global->getValueType()->isArrayTy()) { + splitBuiltinArray(global); + changed = true; + } + } + return changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +// ===================================================================================================================== +// Retrieves metadata for shader input/output elements based on their type. 
+// +// @param elementType : Type of the shader input/output element +// @param elementMetadata : Metadata values for initializing the metadata structure +ShaderInOutMetadata ScalarReplacementOfBuiltins::getShaderInOutMetadata(Type *elementType, Constant *elementMetadata) { + ShaderInOutMetadata inOutMeta = {}; + if (elementType->isArrayTy()) { + assert(elementMetadata->getNumOperands() == 4); + inOutMeta.U64All[0] = cast(elementMetadata->getOperand(2))->getZExtValue(); + inOutMeta.U64All[1] = cast(elementMetadata->getOperand(3))->getZExtValue(); + } else { + assert(elementMetadata->getNumOperands() == 2); + inOutMeta.U64All[0] = cast(elementMetadata->getOperand(0))->getZExtValue(); + inOutMeta.U64All[1] = cast(elementMetadata->getOperand(1))->getZExtValue(); + } + return inOutMeta; +} + +// ===================================================================================================================== +// Determine whether the structure needs to be split. +// +// @param globalBuiltinVar : Global variable containing built-in type +bool ScalarReplacementOfBuiltins::needsSplit(GlobalVariable *globalBuiltinVar) { + auto addressSpace = globalBuiltinVar->getType()->getAddressSpace(); + if (addressSpace != SPIRV::SPIRAS_Output) + return false; + + Type *valueType = globalBuiltinVar->getValueType(); + // NOTE: If the global value type to be split is a structure or array. + if (!valueType->isStructTy() && !valueType->isArrayTy()) + return false; + + MDNode *globalVarMetaNode = globalBuiltinVar->getMetadata(gSPIRVMD::InOut); + Constant *inOutMetaConst = mdconst::dyn_extract(globalVarMetaNode->getOperand(0)); + Constant *firstMemberMeta = nullptr; + Type *firstMemberTy = nullptr; + + if (valueType->isArrayTy()) { + Type *arrayElemmentTy = valueType->getArrayElementType(); + // Note: If the global value type to be split is an array, the member type must be a structure type. 
+ // This is because, according to OpenGL specifications, members of gl_in, gl_out, and gl_MeshVerticesEXT must be of + // structure type. + if (!arrayElemmentTy->isStructTy()) + return false; + + Constant *structureMds = dyn_cast(inOutMetaConst->getOperand(1)); + + firstMemberTy = arrayElemmentTy->getStructElementType(0); + firstMemberMeta = dyn_cast(structureMds->getOperand(0)); + } else if (globalBuiltinVar->getValueType()->isStructTy()) { + // NOTE: If the global value type to be split is a structure, the first member of the structure must be a built-in + // value or a location type for compatibility variables. Only such structures can be split. + Type *globalBuiltinVarTy = globalBuiltinVar->getValueType(); + assert(globalBuiltinVarTy->isStructTy()); + + firstMemberTy = globalBuiltinVarTy->getStructElementType(0); + firstMemberMeta = cast(inOutMetaConst->getOperand(0)); + } + + // NOTE: If the first member is of structure type, we do not need to split it because gl_in, gl_out, or gl_PerVertex + // do not have any members that are of structure type. + if (firstMemberTy->isStructTy()) + return false; + ShaderInOutMetadata firstMeta = getShaderInOutMetadata(firstMemberTy, firstMemberMeta); + // Note: This condition handles only built-in and location value types. 
+ assert(firstMeta.IsBuiltIn || firstMeta.IsLoc); + unsigned builtInId = firstMeta.Value; + if (firstMeta.IsBuiltIn) { + switch (builtInId) { + case spv::BuiltInPosition: + case spv::BuiltInPointSize: + case spv::BuiltInClipDistance: + case spv::BuiltInCullDistance: + return true; + default: + return false; + } + } else { + switch (builtInId) { + case Vkgc::GlCompatibilityInOutLocation::ClipVertex: + case Vkgc::GlCompatibilityInOutLocation::FrontColor: + case Vkgc::GlCompatibilityInOutLocation::BackColor: + case Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor: + case Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor: + case Vkgc::GlCompatibilityInOutLocation::TexCoord: + case Vkgc::GlCompatibilityInOutLocation::FogFragCoord: + return true; + default: + return false; + } + } + + return false; +} + +// ===================================================================================================================== +// Resolves the name of a built-in shader element based on its metadata. 
+// +// @param inOutMeta : Reference to the metadata structure describing the shader element +// @returns : The resolved name of the built-in shader element as a StringRef +StringRef ScalarReplacementOfBuiltins::getBuiltinElementName(ShaderInOutMetadata &inOutMeta) { + StringRef builtinElementName; + unsigned builtInId = inOutMeta.Value; + if (inOutMeta.IsBuiltIn) { + switch (builtInId) { + case spv::BuiltInPosition: + builtinElementName = "_gl_Position"; + break; + case spv::BuiltInPointSize: + builtinElementName = "_gl_PointSize"; + break; + case spv::BuiltInClipDistance: + builtinElementName = "_gl_ClipDistance"; + break; + case spv::BuiltInCullDistance: + builtinElementName = "_gl_CullDistance"; + break; + default: + llvm_unreachable("Not implemented"); + break; + } + } else { + switch (builtInId) { + case Vkgc::GlCompatibilityInOutLocation::ClipVertex: + builtinElementName = "_gl_ClipVertex"; + break; + case Vkgc::GlCompatibilityInOutLocation::FrontColor: + builtinElementName = "_gl_FrontColor"; + break; + case Vkgc::GlCompatibilityInOutLocation::BackColor: + builtinElementName = "_gl_BackColor"; + break; + case Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor: + builtinElementName = "_gl_FrontSecondaryColor"; + break; + case Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor: + builtinElementName = "_gl_BackSecondaryColor"; + break; + case Vkgc::GlCompatibilityInOutLocation::TexCoord: + builtinElementName = "_gl_TexCoord"; + break; + case Vkgc::GlCompatibilityInOutLocation::FogFragCoord: + builtinElementName = "_gl_FogFragCoord"; + break; + default: + llvm_unreachable("Not implemented"); + break; + } + } + return builtinElementName; +} + +// ===================================================================================================================== +// Removes unused newly created built-in global variables. 
+// +// @param elements : Vector of users associated with newly created global variables +void ScalarReplacementOfBuiltins::cleanUpUnusedGlobals(SmallVector &elements) { + for (User *user : make_early_inc_range(elements)) { + GlobalVariable *globalValueReplace = cast(user); + if (globalValueReplace->users().empty()) { + globalValueReplace->dropAllReferences(); + globalValueReplace->eraseFromParent(); + } + } + return; +} + +// ===================================================================================================================== +// Replaces users of a global variable with newly created global variables. +// +// @param globalBuiltinVar : Global variable containing built-in type +// @param elements : Vector of users associated with newly created global variables +void ScalarReplacementOfBuiltins::replaceGlobalBuiltinVar(GlobalVariable *globalBuiltinVar, + SmallVector &elements) { + convertUsersOfConstantsToInstructions(globalBuiltinVar); + for (User *user : make_early_inc_range(globalBuiltinVar->users())) { + if (StoreInst *storeInst = dyn_cast(user)) { + [[maybe_unused]] const DataLayout &dataLayout = storeInst->getModule()->getDataLayout(); + GlobalVariable *globalVar = cast(elements[0]); + assert(dataLayout.getTypeStoreSize(storeInst->getValueOperand()->getType()) <= + dataLayout.getTypeStoreSize(globalVar->getValueType())); + storeInst->replaceUsesOfWith(globalBuiltinVar, globalVar); + } else if (LoadInst *loadInst = dyn_cast(user)) { + GlobalVariable *LoadValue = cast(elements[0]); + loadInst->replaceUsesOfWith(globalBuiltinVar, LoadValue); + } else if (auto *gepInst = dyn_cast(user)) { + SmallVector indices; + GlobalVariable *globalValueReplace = nullptr; + Type *globalValueReplaceTy = nullptr; + unsigned index = UINT_MAX; + + if (globalBuiltinVar->getValueType()->isStructTy()) { + // Note: The newly generated global variables are created based on the elements of the original global structure + // variable. 
Therefore, when encountering a GetElementPtr (GEP) instruction, we utilize the second operand to + // determine which of the newly generated global variables corresponds to a specific element in the original + // structure. + // Example: + // GEP Instruction: getelementptr ({ <4 x float>, float... }, ptr addrspace(65) @0, i32 0, i32 4) + // Here, `gepInst->idx_begin() + 1` retrieves the index to access the fourth element of the + // original structure (0-indexed), which corresponds to the fourth newly created global variable. + // This allows matching the GEP indices with the corresponding split global variables. + index = cast(gepInst->idx_begin() + 1)->getZExtValue(); + indices.push_back(*(gepInst->idx_begin())); + unsigned int numIndices = gepInst->getNumIndices(); + if (numIndices >= 3) + indices.append(gepInst->idx_begin() + 2, gepInst->idx_end()); + assert(cast(indices[0])->isZero() && "Non-zero GEP first index\n"); + } else if (globalBuiltinVar->getValueType()->isArrayTy()) { + // Note: The newly generated global variables are derived from the elements of the original array. + // When processing a GetElementPtr (GEP) instruction that navigates through such an array, the third operand + // (after the base pointer and the initial index which is typically zero) indicates the specific element + // in the array that is being accessed. + // Example: + // GEP Instruction: getelementptr [3 x { <4 x float>, ... }], ptr addrspace(65) @gl_out, i32 0, i32 %5, i32 4 + // In this example, `gepInst->idx_begin() + 2` corresponds to `i32 4`, which is used to access the fourth + // element of the array (0-indexed). This element index is used to determine the appropriate newly created + // global variable that corresponds to this element in the original array structure. This indexing helps in + // directly mapping the GEP instruction indices to the split global variables. 
+ index = cast(gepInst->idx_begin() + 2)->getZExtValue(); + for (auto it = gepInst->idx_begin(); it != gepInst->idx_end(); ++it) { + if (it - gepInst->idx_begin() == 2) + continue; + indices.push_back(*it); + } + } else { + llvm_unreachable("Not implemented"); + } + + globalValueReplace = cast(elements[index]); + globalValueReplaceTy = globalValueReplace->getValueType(); + m_builder->SetInsertPoint(gepInst); + Value *gepElement = + m_builder->CreateGEP(globalValueReplaceTy, elements[index], indices, "", + gepInst->isInBounds() ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none()); + gepInst->replaceAllUsesWith(gepElement); + gepInst->eraseFromParent(); + } else { + llvm_unreachable("Not implemented"); + } + } + return; +} + +// ===================================================================================================================== +// Splits a global variable of structure type containing built-in elements into individual components. +// +// @param globalBuiltinVar : Global variable containing built-in type +void ScalarReplacementOfBuiltins::splitBuiltinStructure(GlobalVariable *globalBuiltinVar) { + SmallVector elements; + StringRef prefixName = globalBuiltinVar->getName(); + MDNode *metaNode = globalBuiltinVar->getMetadata(gSPIRVMD::InOut); + assert(metaNode); + Constant *inOutMetaConst = mdconst::extract(metaNode->getOperand(0)); + Type *globalBuiltinVarTy = globalBuiltinVar->getValueType(); + assert(globalBuiltinVarTy->isStructTy()); + auto structElementCount = globalBuiltinVarTy->getStructNumElements(); + assert(structElementCount == inOutMetaConst->getType()->getStructNumElements()); + + for (unsigned idx = 0; idx < structElementCount; ++idx) { + Type *elementType = globalBuiltinVarTy->getStructElementType(idx); + Constant *elementMetadata = cast(inOutMetaConst->getOperand(idx)); + ShaderInOutMetadata inOutMeta = getShaderInOutMetadata(elementType, elementMetadata); + + // Note: This condition handles only built-in and location value types. 
+ assert(inOutMeta.IsBuiltIn || inOutMeta.IsLoc); + StringRef builtinElementName = getBuiltinElementName(inOutMeta); + GlobalVariable *replacementBuiltinVar = new GlobalVariable( + *m_module, elementType, false, GlobalValue::ExternalLinkage, nullptr, prefixName + builtinElementName, nullptr, + GlobalVariable::NotThreadLocal, SPIRV::SPIRAS_Output); + + replacementBuiltinVar->addMetadata(gSPIRVMD::InOut, + *MDNode::get(*m_context, {ConstantAsMetadata::get(elementMetadata)})); + elements.push_back(replacementBuiltinVar); + } + + // NOTE: Replace global variable users. + replaceGlobalBuiltinVar(globalBuiltinVar, elements); + + // Cleans up unused newly created built-in global variables. + cleanUpUnusedGlobals(elements); + + globalBuiltinVar->dropAllReferences(); + globalBuiltinVar->eraseFromParent(); + return; +} + +// ===================================================================================================================== +// Splits a global variable of array type containing built-in elements into individual components. 
+// +// @param globalBuiltinVar : Global variable containing built-in type +void ScalarReplacementOfBuiltins::splitBuiltinArray(GlobalVariable *globalBuiltinVar) { + assert(globalBuiltinVar->getValueType()->getArrayElementType()->isStructTy()); + Type *arrayElemmentTy = globalBuiltinVar->getValueType()->getArrayElementType(); + auto structureElementNum = arrayElemmentTy->getStructNumElements(); + StringRef prefixName = globalBuiltinVar->getName(); + auto arrayElementNum = globalBuiltinVar->getValueType()->getArrayNumElements(); + SmallVector elements; + MDNode *globalVarMetaNode = globalBuiltinVar->getMetadata(gSPIRVMD::InOut); + assert(globalVarMetaNode); + Constant *inOutMetaConst = mdconst::dyn_extract(globalVarMetaNode->getOperand(0)); + Constant *structureMds = dyn_cast(inOutMetaConst->getOperand(1)); + auto int32Type = m_builder->getInt32Ty(); + auto int64Type = m_builder->getInt64Ty(); + + for (int idx = 0; idx < structureElementNum; ++idx) { + Constant *memberMeta = dyn_cast(structureMds->getOperand(idx)); + assert(memberMeta && "memberMeta should not be null"); + + Type *memberElementTy = arrayElemmentTy->getStructElementType(idx); + ShaderInOutMetadata inOutMeta = getShaderInOutMetadata(memberElementTy, memberMeta); + auto builtInId = inOutMeta.Value; + ArrayType *replaceElementTy = ArrayType::get(memberElementTy, arrayElementNum); + // Note: This condition handles only built-in and location value types. 
+ assert((inOutMeta.IsBuiltIn || inOutMeta.IsLoc) && "Expected built-in or location metadata"); + StringRef builtinElementName = getBuiltinElementName(inOutMeta); + + GlobalVariable *replaceBuiltinElement = + new GlobalVariable(*m_module, replaceElementTy, globalBuiltinVar->isConstant(), globalBuiltinVar->getLinkage(), + nullptr, prefixName + builtinElementName, nullptr, globalBuiltinVar->getThreadLocalMode(), + globalBuiltinVar->getType()->getAddressSpace()); + + ShaderInOutMetadata memberInOutMd = {}; + memberInOutMd.IsBuiltIn = inOutMeta.IsBuiltIn; + memberInOutMd.IsLoc = inOutMeta.IsLoc; + memberInOutMd.Value = builtInId; + + Type *elmdTy = memberMeta->getType(); + StructType *mdTy = StructType::get(*m_context, {int32Type, elmdTy, int64Type, int64Type}); + SmallVector mdValues; + mdValues.push_back(ConstantInt::get(int32Type, 1)); + mdValues.push_back(memberMeta); + mdValues.push_back(ConstantInt::get(int64Type, memberInOutMd.U64All[0])); + mdValues.push_back(ConstantInt::get(int64Type, memberInOutMd.U64All[1])); + + Constant *mdVariable = ConstantStruct::get(mdTy, mdValues); + replaceBuiltinElement->addMetadata(gSPIRVMD::InOut, + *MDNode::get(*m_context, {ConstantAsMetadata::get(mdVariable)})); + elements.push_back(replaceBuiltinElement); + } + + replaceGlobalBuiltinVar(globalBuiltinVar, elements); + + // Cleans up unused newly created built-in global variables. + cleanUpUnusedGlobals(elements); + globalBuiltinVar->dropAllReferences(); + globalBuiltinVar->eraseFromParent(); + return; +} + +} // namespace Llpc diff --git a/llpc/lowering/ScalarReplacementOfBuiltins.h b/llpc/lowering/ScalarReplacementOfBuiltins.h new file mode 100644 index 0000000000..20ae5787ed --- /dev/null +++ b/llpc/lowering/ScalarReplacementOfBuiltins.h @@ -0,0 +1,59 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file ScalarReplacementOfBuiltins.h + * @brief LLPC header file: split and replace global variables that are structures containing built-in values. 
+ *********************************************************************************************************************** + */ +#pragma once + +#include "Lowering.h" +#include "SPIRVInternal.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/PassManager.h" + +namespace Llpc { + +// ===================================================================================================================== +// Pass that splits and replaces global variables that are structures containing built-in values +class ScalarReplacementOfBuiltins : public SpirvLower, public llvm::PassInfoMixin { +public: + ScalarReplacementOfBuiltins() {} + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + + static llvm::StringRef name() { return "Scalar replacement of builtins"; } + +private: + ShaderInOutMetadata getShaderInOutMetadata(Type *elementType, Constant *elementMetadata); + bool needsSplit(GlobalVariable *builtinGlobalVar); + StringRef getBuiltinElementName(ShaderInOutMetadata &inOutMeta); + void cleanUpUnusedGlobals(SmallVector &elements); + void replaceGlobalBuiltinVar(GlobalVariable *builtinGlobalVar, SmallVector &elements); + void splitBuiltinStructure(GlobalVariable *builtinGlobalVar); + void splitBuiltinArray(GlobalVariable *builtinGlobalVar); +}; + +} // namespace Llpc diff --git a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm index 014c4f991f..44f5f4b79f 100644 --- a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm +++ b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py ; BEGIN_SHADERTEST -; RUN: amdllpc --print-after=llpc-spirv-lower-translator -filetype=asm -o - 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s +; RUN: amdllpc --print-after=lower-translator -filetype=asm -o - 2>&1 %s | FileCheck 
-check-prefixes=SHADERTEST %s ; #version 450 ; #extension GL_EXT_nonuniform_qualifier : require ; #extension GL_ARB_gpu_shader_int64 : require diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp index 90659fc955..24e47f5333 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp @@ -81,7 +81,7 @@ void main() ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0) ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0) ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16(i32 9, i32 8, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0) -; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.i32(float 9.000000e+00, i32 7, i32 7, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0) +; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.{{i32|i16}}(float 9.000000e+00, {{i32|i16}} 7, {{i32|i16}} 7, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0) ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag index e5eb6a602c..a9e619ecc8 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag @@ -83,7 +83,7 @@ void main() ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.cube.i32.i16(i32 %{{.*}}, i16 2, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0) ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.cube.i32.i16(i32 %{{.*}}, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0) ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.cube.i32.i16(i32 %{{.*}}, i32 17, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0) -; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.i32(float 
%{{[-0-9A-Za0z_.]+}}, i32 3, i32 3, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0) +; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.{{i32|i16}}(float %{{[-0-9A-Za-z_.]+}}, {{i32|i16}} 3, {{i32|i16}} 3, <8 x i32> %{{[-0-9A-Za-z_.]+}}, i32 0, i32 0) ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestSharedVariable_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestSharedVariable_lit.comp index d10801438c..72027ec915 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestSharedVariable_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestSharedVariable_lit.comp @@ -61,7 +61,6 @@ void main() // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: @{{.*}} = addrspace(3) global i32 diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestStorageBlockAndSharedWithData64_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestStorageBlockAndSharedWithData64_lit.comp index 2a3e1af505..b7727a2673 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestStorageBlockAndSharedWithData64_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestStorageBlockAndSharedWithData64_lit.comp @@ -80,7 +80,6 @@ void main () // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: atomicrmw umin ptr addrspace({{.*}}) %{{[0-9]*}}, i64 %{{[0-9]*}} monotonic ; SHADERTEST: atomicrmw umax ptr addrspace({{.*}}) %{{[0-9]*}}, i64 %{{[0-9]*}} monotonic diff --git a/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp b/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp index b27b475c72..8058ea943b 100644 --- a/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp +++ b/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by
tool/update_llpc_test_checks.py // RUN: amdllpc -o - -filetype=asm %s | FileCheck -check-prefixes=CHECK %s +// REQUIRES: do-not-run-me #version 450 #extension GL_KHR_shader_subgroup_arithmetic : require @@ -27,12 +28,9 @@ void main() { // CHECK-NEXT: v_lshlrev_b32_e32 v4, 2, v4 // CHECK-NEXT: s_waitcnt lgkmcnt(0) // CHECK-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen -// CHECK-NEXT: s_waitcnt vmcnt(0) -// CHECK-NEXT: v_mov_b32_e32 v0, v5 -// CHECK-NEXT: s_not_b64 exec, exec -// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 -// CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[0:1], -1 +// CHECK-NEXT: s_waitcnt vmcnt(0) +// CHECK-NEXT: v_cndmask_b32_e64 v0, 0xff800000, v5, s[0:1] // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1 // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 row_half_mirror row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -44,18 +42,13 @@ void main() { // CHECK-NEXT: v_max_f32_e64 v0, s2, s3 // CHECK-NEXT: s_mov_b64 exec, s[0:1] // CHECK-NEXT: v_mov_b32_e32 v5, v0 -// CHECK-NEXT: s_or_saveexec_b64 s[0:1], -1 -// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 -// CHECK-NEXT: s_mov_b64 exec, s[0:1] -// CHECK-NEXT: v_mov_b32_e32 v1, v5 -// CHECK-NEXT: s_not_b64 exec, exec -// CHECK-NEXT: v_mov_b32_e32 v1, 0xff800000 -// CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1 -// CHECK-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 +// CHECK-NEXT: v_cndmask_b32_e64 v1, 0xff800000, v5, s[2:3] // CHECK-NEXT: v_mov_b32_e32 v2, 0xff800000 // CHECK-NEXT: s_mov_b32 vcc_lo, 0xffff0000 // CHECK-NEXT: s_mov_b32 vcc_hi, vcc_lo +// CHECK-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf // CHECK-NEXT: v_max_f32_e32 v0, v1, v0 // CHECK-NEXT: v_mov_b32_e32 v1, 0xff800000 // CHECK-NEXT: v_mov_b32_dpp v2, v0 row_shr:2 
row_mask:0xf bank_mask:0xf @@ -76,15 +69,12 @@ void main() { // CHECK-NEXT: v_max_f32_e32 v0, v0, v1 // CHECK-NEXT: s_mov_b64 exec, s[2:3] // CHECK-NEXT: v_mov_b32_e32 v5, v0 -// CHECK-NEXT: v_mov_b32_e32 v0, v5 -// CHECK-NEXT: s_not_b64 exec, exec -// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 -// CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +// CHECK-NEXT: v_cndmask_b32_e64 v0, 0xff800000, v5, s[8:9] // CHECK-NEXT: s_mov_b32 s2, 0x6543210f // CHECK-NEXT: v_mov_b32_e32 v2, 0xff800000 -// CHECK-NEXT: v_permlane16_b32 v0, v0, s2, 0xedcba987 op_sel:[1,0] // CHECK-NEXT: v_mov_b32_e32 v3, 0xff800000 +// CHECK-NEXT: v_permlane16_b32 v0, v0, s2, 0xedcba987 op_sel:[1,0] // CHECK-NEXT: v_readlane_b32 s2, v0, 16 // CHECK-NEXT: v_writelane_b32 v0, s2, 48 // CHECK-NEXT: s_mov_b32 s2, 0xff800000 @@ -114,11 +104,8 @@ void main() { // CHECK-NEXT: v_max_f32_e32 v0, v0, v1 // CHECK-NEXT: s_mov_b64 exec, s[8:9] // CHECK-NEXT: v_mov_b32_e32 v5, v0 -// CHECK-NEXT: v_mov_b32_e32 v0, v5 -// CHECK-NEXT: s_not_b64 exec, exec -// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 -// CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[0:1], -1 +// CHECK-NEXT: v_cndmask_b32_e64 v0, 0xff800000, v5, s[0:1] // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1 // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 row_half_mirror row_mask:0xf bank_mask:0xf bound_ctrl:1 diff --git a/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert b/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert index 18f2387123..76a6e50288 100644 --- a/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert +++ b/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert @@ -22,8 +22,8 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=WITHOUT_IIE %s ; WITHOUT_IIE-LABEL: {{^// LLPC}} pipeline 
before-patching results ; WITHOUT_IIE: %[[val:.*]] = extractvalue [4 x <4 x float>] %{{.*}}, 3 -; WITHOUT_IIE: %[[mul:.*]] = fmul <4 x float> %[[val]], %{{.*}} -; WITHOUT_IIE: %[[arg:.*]] = fadd <4 x float> %{{.*}}, %[[mul]] +; WITHOUT_IIE: %[[mul:.*]] = fmul nnan nsz <4 x float> %[[val]], %{{.*}} +; WITHOUT_IIE: %[[arg:.*]] = fadd nnan nsz <4 x float> %{{.*}}, %[[mul]] ; WITHOUT_IIE-NEXT: call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 x float> %[[arg]]) ; WITHOUT_IIE: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert index 30c0a02033..ec588303a7 100644 --- a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert +++ b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert @@ -29,8 +29,8 @@ void main() // //. // CHECK: attributes #[[ATTR0]] = { alwaysinline nounwind "denormal-fp-math-f32"="preserve-sign" } -// CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } -// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn memory(read) } +// CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) } +// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind } //. 
// CHECK: [[META0:![0-9]+]] = !{!"Vulkan"} // CHECK: [[META1:![0-9]+]] = !{i32 1} diff --git a/llpc/test/shaderdb/extensions/ExtShaderInt64_TestRelationalOp_lit.frag b/llpc/test/shaderdb/extensions/ExtShaderInt64_TestRelationalOp_lit.frag index 4fb6f2cbf6..bb5734d317 100644 --- a/llpc/test/shaderdb/extensions/ExtShaderInt64_TestRelationalOp_lit.frag +++ b/llpc/test/shaderdb/extensions/ExtShaderInt64_TestRelationalOp_lit.frag @@ -1,6 +1,5 @@ // NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py // RUN: amdllpc -emit-lgc -gfxip 10.3 -o - %s | FileCheck -check-prefix=SHADERTEST %s -// REQUIRES: do-not-run-me #version 450 @@ -37,9 +36,9 @@ void main() // SHADERTEST-NEXT: .entry: // SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) -// SHADERTEST-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 32 +// SHADERTEST-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP0]], i32 32 // SHADERTEST-NEXT: [[TMP3:%.*]] = load <3 x i64>, ptr addrspace(7) [[TMP2]], align 32 -// SHADERTEST-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 64 +// SHADERTEST-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP0]], i32 64 // SHADERTEST-NEXT: [[TMP5:%.*]] = load <3 x i64>, ptr addrspace(7) [[TMP4]], align 32 // SHADERTEST-NEXT: [[TMP6:%.*]] = extractelement <3 x i64> [[TMP3]], i64 0 // SHADERTEST-NEXT: [[TMP7:%.*]] = extractelement <3 x i64> [[TMP5]], i64 0 @@ -114,7 +113,7 @@ void main() // SHADERTEST: 67: // SHADERTEST-NEXT: [[DOT022_IN:%.*]] = phi <3 x i1> [ [[TMP51]], [[TMP37]] ], [ [[TMP66]], [[TMP52]] ] // SHADERTEST-NEXT: [[TMP68:%.*]] = load i64, ptr addrspace(7) [[TMP0]], align 8 -// SHADERTEST-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 8 +// SHADERTEST-NEXT: [[TMP69:%.*]] = getelementptr 
i8, ptr addrspace(7) [[TMP0]], i32 8 // SHADERTEST-NEXT: [[TMP70:%.*]] = load i64, ptr addrspace(7) [[TMP69]], align 8 // SHADERTEST-NEXT: [[TMP71:%.*]] = icmp ne i64 [[TMP68]], [[TMP70]] // SHADERTEST-NEXT: [[COND_FREEZE4:%.*]] = freeze i1 [[TMP71]] diff --git a/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp b/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp index 48598a48f4..ab2da40227 100644 --- a/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp +++ b/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp @@ -42,7 +42,6 @@ void main() // BEGIN_SHADERTEST /* ; RUN: amdllpc -enable-load-scalarizer=false -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST-COUNT-4: getelementptr {{.*}}[4 x { i8, <2 x i8>, <3 x i8>, <4 x i8> }], ptr addrspace(3) @{{.*}}, i32 0, i32 {{%?[0-9]+}}, i32 {{[0-3]}} diff --git a/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport.spvasm b/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport.spvasm new file mode 100644 index 0000000000..a191df60db --- /dev/null +++ b/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport.spvasm @@ -0,0 +1,50 @@ +; SPIR-V +; Version: 1.6 +; Generator: Khronos Glslang Reference Front End; 11 +; Bound: 19 +; Schema: 0 + OpCapability Shader + %1 = OpExtInstImport "GLSL.std.450" + OpMemoryModel Logical GLSL450 + OpEntryPoint Fragment %main "main" %_ + OpExecutionMode %main OriginUpperLeft + OpSource GLSL 450 + OpName %main "main" + OpName %UniformData "UniformData" + OpMemberName %UniformData 0 "valueNonZero" + OpName %_ "" + OpMemberDecorate %UniformData 0 Offset 0 + OpDecorate %UniformData Block + OpDecorate %_ DescriptorSet 0 + OpDecorate %_ Binding 0 + %void = OpTypeVoid + %3 = OpTypeFunction %void + %int = OpTypeInt 32 1 +%UniformData = 
OpTypeStruct %int +%_ptr_Uniform_UniformData = OpTypePointer Uniform %UniformData + %_ = OpVariable %_ptr_Uniform_UniformData Uniform + %int_0 = OpConstant %int 0 +%_ptr_Uniform_int = OpTypePointer Uniform %int + %bool = OpTypeBool + %main = OpFunction %void None %3 + %5 = OpLabel + %12 = OpAccessChain %_ptr_Uniform_int %_ %int_0 + %13 = OpLoad %int %12 + %15 = OpINotEqual %bool %13 %int_0 + OpSelectionMerge %17 None + OpBranchConditional %15 %16 %17 + %16 = OpLabel + OpTerminateInvocation + %17 = OpLabel + OpReturn + OpFunctionEnd + +; BEGIN_SHADERTEST +; RUN: amdllpc -v %gfxip %s | FileCheck --check-prefix=SHADERTEST %s +; SHADERTEST-LABEL: {{^//}} LLPC final ELF info +; SHADERTEST: .cb_shader_mask: +; SHADERTEST-NEXT: .output0_enable: 0x0000000000000001 +; SHADERTEST: .spi_shader_col_format: +; SHADERTEST-NEXT: .col_0_export_format: 0x0000000000000001 +; SHADERTEST: AMDLLPC SUCCESS +; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport4.pipe b/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport4.pipe new file mode 100644 index 0000000000..e71c1b5a7f --- /dev/null +++ b/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport4.pipe @@ -0,0 +1,65 @@ +[Version] +version = 75 + +[VsGlsl] +#version 450 core +void main() {} + +[VsInfo] +entryPoint = main + +[FsGlsl] +#version 450 core +layout(set = 0, binding = 0, std140) uniform UniformData +{ + int valueNonZero; +}; + +void main() { + if (valueNonZero != 0) + discard; +} + +[FsInfo] +entryPoint = main + +[GraphicsPipelineState] +colorBuffer[0].format = VK_FORMAT_R8G8B8A8_SRGB +colorBuffer[0].channelWriteMask = 7 +colorBuffer[0].blendEnable = 0 +colorBuffer[0].blendSrcAlphaToColor = 0 + +[ResourceMapping] +descriptorRangeValue[0].visibility = 66 +descriptorRangeValue[0].type = DescriptorConstBuffer +descriptorRangeValue[0].set = 0 +descriptorRangeValue[0].binding = 0 +descriptorRangeValue[0].arraySize = 1 +descriptorRangeValue[0].uintData = 134217874, 16773120, 603979776, 0 + 
+userDataNode[0].visibility = 2 +userDataNode[0].type = IndirectUserDataVaPtr +userDataNode[0].offsetInDwords = 0 +userDataNode[0].sizeInDwords = 1 +userDataNode[0].indirectUserDataCount = 4 +userDataNode[1].visibility = 66 +userDataNode[1].type = DescriptorTableVaPtr +userDataNode[1].offsetInDwords = 8 +userDataNode[1].sizeInDwords = 1 +userDataNode[1].next[0].type = DescriptorConstBuffer +userDataNode[1].next[0].offsetInDwords = 0 +userDataNode[1].next[0].sizeInDwords = 4 +userDataNode[1].next[0].set = 0x00000000 +userDataNode[1].next[0].binding = 0 +userDataNode[1].next[0].strideInDwords = 8 + + +; BEGIN_SHADERTEST +; RUN: amdllpc -v %gfxip %s | FileCheck --check-prefix=SHADERTEST %s +; SHADERTEST-LABEL: {{^//}} LLPC final ELF info +; SHADERTEST: .cb_shader_mask: +; SHADERTEST-NEXT: .output0_enable: 0x000000000000000F +; SHADERTEST: .spi_shader_col_format: +; SHADERTEST-NEXT: .col_0_export_format: 0x0000000000000004 +; SHADERTEST: AMDLLPC SUCCESS +; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineGsTess_TestInOutPacking.pipe b/llpc/test/shaderdb/general/PipelineGsTess_TestInOutPacking.pipe index a371688436..133d0d4913 100644 --- a/llpc/test/shaderdb/general/PipelineGsTess_TestInOutPacking.pipe +++ b/llpc/test/shaderdb/general/PipelineGsTess_TestInOutPacking.pipe @@ -17,11 +17,11 @@ ; SHADERTEST: (GS) Output: stream = 0, [location, component] = [2, 1] => Mapped = [1, 3] ; SHADERTEST: (GS) Output: stream = 0, [location, component] = [4, 0] => Mapped = [2, 0] ; SHADERTEST: (GS) Output: stream = 0, [location, component] = [4, 1] => Mapped = [2, 1] -; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 0] => Mapped = [2, 0] -; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 1] => Mapped = [2, 1] -; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 2] => Mapped = [2, 2] -; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 3] => Mapped = [2, 3] -; SHADERTEST: (GS) Output: stream = 1, [location, 
component] = [4, 3] => Mapped = [3, 0] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 0] => Mapped = [0, 0] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 1] => Mapped = [0, 1] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 2] => Mapped = [0, 2] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 3] => Mapped = [0, 3] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [4, 3] => Mapped = [1, 0] ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 32, i32 15 ; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 33, i32 15 diff --git a/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe b/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe index d432d16530..87b36861ba 100644 --- a/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe +++ b/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe @@ -2,7 +2,7 @@ ; BEGIN_SHADERTEST ; REQUIRES: gpurt -; RUN: amdllpc --print-after=llpc-spirv-lower-translator -gfxip 10.3 -o /dev/null 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s +; RUN: amdllpc --print-after=lower-translator -gfxip 10.3 -o /dev/null 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s ; SHADERTEST-LABEL: @main( ; SHADERTEST: call void (...) 
@lgc.rt.trace.ray(i64 %{{[0-9]+}}, i32 0, i32 %{{[0-9]+}}, i32 0, i32 0, i32 0, <3 x float> %{{[0-9]+}}, float %{{[0-9]+}}, <3 x float> %{{[0-9]+}}, float %{{[0-9]+}}, ptr addrspace(5) @RayPayloadKHR0, [1 x i32] [i32 16]) ; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineTess_XfbWithManyComponents.pipe b/llpc/test/shaderdb/general/PipelineTess_XfbWithManyComponents.pipe index 468e4376e0..4533eca767 100644 --- a/llpc/test/shaderdb/general/PipelineTess_XfbWithManyComponents.pipe +++ b/llpc/test/shaderdb/general/PipelineTess_XfbWithManyComponents.pipe @@ -5,8 +5,8 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -v -gfxip=11 %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: LLPC geometry calculation factor results -; SHADERTEST: ES-GS ring item size (in dwords): 129 +; SHADERTEST-LABEL: LLPC HW GS configurations +; SHADERTEST: EsGsRingItemSize = 129 dwords ; SHADERTEST-LABEL: .fetchXfbOutput ; Write v4[31] = 4.0 -> LDS diff --git a/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe b/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe index 425456fb11..61e32230f7 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --function-signature -; RUN: amdllpc -stop-after=lgc-patch-entry-point-mutate -o - %s | FileCheck -check-prefixes=SHADERTEST %s +; RUN: amdllpc -stop-after=lgc-mutate-entry-point -o - %s | FileCheck -check-prefixes=SHADERTEST %s [Version] version = 64 @@ -115,7 +115,6 @@ attribute[1].offset = 16 ; SHADERTEST-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3 ; SHADERTEST-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP31]], i32 3 ; SHADERTEST-NEXT: [[VERTEX0_0:%.*]] = bitcast <4 x i32> [[TMP32]] to <4 x float> -; SHADERTEST-NEXT: call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 
x float> [[VERTEX0_0]]) #[[ATTR7:[0-9]+]] ; SHADERTEST-NEXT: [[TMP33:%.*]] = extractelement <2 x float> [[VERTEX1_0]], i64 0 ; SHADERTEST-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[VERTEX1_0]], i64 1 ; SHADERTEST-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[VERTEX1_0]], i64 0 @@ -132,7 +131,8 @@ attribute[1].offset = 16 ; SHADERTEST-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP45]], i64 2 ; SHADERTEST-NEXT: [[TMP47:%.*]] = bitcast i32 [[TMP40]] to float ; SHADERTEST-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP47]], i64 3 -; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP48]]) #[[ATTR7]] +; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP48]]) #[[ATTR7:[0-9]+]] +; SHADERTEST-NEXT: call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 x float> [[VERTEX0_0]]) #[[ATTR7]] ; SHADERTEST-NEXT: ret void ; ; diff --git a/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe b/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe index ac195358cc..bdbb83a8e3 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe @@ -2,9 +2,11 @@ // instruction combine removing subtraction for gl_Position computation. 
; BEGIN_SHADERTEST -; RUN: amdllpc --gfxip=10.3.0 -v %s | FileCheck -check-prefix=SHADERTEST %s +; RUN: amdllpc --gfxip=10.3.0 -v %s | FileCheck -check-prefixes=SHADERTEST,OPT %s +; RUN: amdllpc --gfxip=10.3.0 --disable-gl-position-opt=1 -v %s | FileCheck -check-prefixes=SHADERTEST,NOOPT %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: fsub float 1.000000e+00, %__llpc_input_proxy_in_Pos.0.vec.extract +; OPT: fsub nnan nsz float 1.000000e+00, %__llpc_input_proxy_in_Pos.0.vec.extract +; NOOPT: fsub float 1.000000e+00, %__llpc_input_proxy_in_Pos.0.vec.extract ; SHADERTEST-LABEL: _amdgpu_vs_main: ; SHADERTEST: v_sub_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} ; END_SHADERTEST @@ -51,5 +53,5 @@ binding[0].stride = 2 binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX attribute[0].location = 0 attribute[0].binding = 0 -attribute[0].format = VK_FORMAT_R8G8_SNORM +attribute[0].format = VK_FORMAT_R8G8B8A8_SNORM attribute[0].offset = 0 diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestBarycentric_tri_fan.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestBarycentric_tri_fan.pipe index 8d23a38228..5bad45c791 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestBarycentric_tri_fan.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestBarycentric_tri_fan.pipe @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --function amdgpu_ps_main ; RUN: amdllpc -filetype=asm -gfxip=10.3 -o - %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run ; This test tests barycentric coordinate when topology is VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN ; barycentric coordinate: (i ,j , 1 - i - j). 
diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe index c1c9296094..5b7d55e278 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe @@ -217,88 +217,4 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .wgp_mode: false ; CHECK-NEXT: .writes_depth: 0 ; CHECK-NEXT: .writes_uavs: false -; CHECK-NEXT: .vs: -; CHECK-NEXT: .checksum_value: 0xba71f629 -; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_vs_main -; CHECK-NEXT: .float_mode: 0xc0 -; CHECK-NEXT: .ieee_mode: false -; CHECK: .mem_ordered: true -; CHECK-NEXT: .scratch_en: false -; CHECK-NEXT: .scratch_memory_size: 0 -; CHECK-NEXT: .sgpr_count: 0x3 -; CHECK-NEXT: .sgpr_limit: 0x6a -; CHECK-NEXT: .trap_present: 0 -; CHECK-NEXT: .user_data_reg_map: -; CHECK-NEXT: - 0x10000000 -; CHECK-NEXT: - 0x10000003 -; CHECK-NEXT: - 0x10000004 -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: .user_sgprs: 0x3 -; CHECK-NEXT: .vgpr_count: 0x4 -; CHECK-NEXT: .vgpr_limit: 0x100 -; CHECK-NEXT: .wavefront_size: 0x20 -; CHECK-NEXT: .wgp_mode: false -; CHECK-NEXT: .internal_pipeline_hash: -; 
CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: .num_interpolants: 0x1 -; CHECK-NEXT: .registers: {} -; CHECK-NEXT: .shaders: -; CHECK-NEXT: .pixel: -; CHECK-NEXT: .api_shader_hash: -; CHECK-NEXT: - 0 -; CHECK-NEXT: - 0 -; CHECK-NEXT: .hardware_mapping: -; CHECK-NEXT: - .ps -; CHECK-NEXT: .vertex: -; CHECK-NEXT: .api_shader_hash: -; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: - 0 -; CHECK-NEXT: .hardware_mapping: -; CHECK-NEXT: - .vs -; CHECK-NEXT: .spill_threshold: 0xffff -; CHECK-NEXT: .streamout_vertex_strides: -; CHECK-NEXT: - 0 -; CHECK-NEXT: - 0 -; CHECK-NEXT: - 0 -; CHECK-NEXT: - 0 -; CHECK-NEXT: .type: VsPs -; CHECK-NEXT: .user_data_limit: 0x1 -; CHECK-NEXT: .xgl_cache_info: -; CHECK-NEXT: .128_bit_cache_hash: -; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: .llpc_version: {{.*}} -; CHECK-NEXT: amdpal.version: -; CHECK-NEXT: - 0x3 -; CHECK-NEXT: - 0 -; CHECK-NEXT: ... +; diff --git a/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm b/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm index a4c8ce1f79..804a467c3f 100644 --- a/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm +++ b/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm @@ -4,7 +4,6 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: @[[LDS0:[^ ]*]] = addrspace(3) global <{ [8 x i32] }> poison, align 4 ; SHADERTEST: @[[LDS1:[^ ]*]] = addrspace(3) global <{ [4 x i32] }> poison, align 4 @@ -14,9 +13,9 @@ ; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}([4 x i32], ptr addrspace(3) @[[LDS1]], i32 0, i32 2), align 4 ; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}([4 x i32], ptr addrspace(3) @[[LDS1]], i32 0, i32 3), align 4 ; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x 
i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1, i32 1), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1, i32 2), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1, i32 3), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1), i32 0, i32 1), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1), i32 0, i32 2), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1), i32 0, i32 3), align 4 ; SHADERTEST: load i32, ptr addrspace(3) @[[LDS0]], align 4 ; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}([8 x i32], ptr addrspace(3) @[[LDS0]], i32 0, i32 1), align 4 ; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}([8 x i32], ptr addrspace(3) @[[LDS0]], i32 0, i32 2), align 4 diff --git a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe index 4962809bbb..b68f5d6674 100644 --- a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe +++ b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe @@ -187,144 +187,4 @@ entryPoint = main ; SHADERTEST-NEXT: .vs_so_base2_en: false ; SHADERTEST-NEXT: .vs_so_base3_en: false ; SHADERTEST-NEXT: .vs_streamout_en: false -; SHADERTEST-NEXT: .hardware_stages: -; SHADERTEST-NEXT: .ps: -; SHADERTEST-NEXT: .checksum_value: 0x4658ef51 -; 
SHADERTEST-NEXT: .debug_mode: false -; SHADERTEST-NEXT: .entry_point: _amdgpu_ps_main -; SHADERTEST-NEXT: .float_mode: 0xc0 -; SHADERTEST-NEXT: .ieee_mode: false -; SHADERTEST-NEXT: .lds_size: 0 -; SHADERTEST-NEXT: .mem_ordered: true -; SHADERTEST-NEXT: .scratch_en: false -; SHADERTEST-NEXT: .scratch_memory_size: 0 -; SHADERTEST-NEXT: .sgpr_count: 0x2 -; SHADERTEST-NEXT: .sgpr_limit: 0x6a -; SHADERTEST-NEXT: .trap_present: 0 -; SHADERTEST-NEXT: .user_data_reg_map: -; SHADERTEST-NEXT: - 0x10000000 -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: .user_sgprs: 0x1 -; SHADERTEST-NEXT: .uses_uavs: false -; SHADERTEST-NEXT: .vgpr_count: 0x2 -; SHADERTEST-NEXT: .vgpr_limit: 0x100 -; SHADERTEST-NEXT: .wavefront_size: 0x40 -; SHADERTEST-NEXT: .wgp_mode: false -; SHADERTEST-NEXT: .writes_depth: 0 -; SHADERTEST-NEXT: .writes_uavs: false -; SHADERTEST-NEXT: .vs: -; SHADERTEST-NEXT: .checksum_value: 0xd2536693 -; SHADERTEST-NEXT: .debug_mode: false -; SHADERTEST-NEXT: .entry_point: _amdgpu_vs_main -; 
SHADERTEST-NEXT: .float_mode: 0xc0 -; SHADERTEST-NEXT: .ieee_mode: false -; SHADERTEST-NEXT: .lds_size: 0 -; SHADERTEST-NEXT: .mem_ordered: true -; SHADERTEST-NEXT: .scratch_en: false -; SHADERTEST-NEXT: .scratch_memory_size: 0 -; SHADERTEST-NEXT: .sgpr_count: 0x3 -; SHADERTEST-NEXT: .sgpr_limit: 0x6a -; SHADERTEST-NEXT: .trap_present: 0 -; SHADERTEST-NEXT: .user_data_reg_map: -; SHADERTEST-NEXT: - 0x10000000 -; SHADERTEST-NEXT: - 0x10000003 -; SHADERTEST-NEXT: - 0x10000004 -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: .user_sgprs: 0x3 -; SHADERTEST-NEXT: .vgpr_count: 0x4 -; SHADERTEST-NEXT: .vgpr_limit: 0x100 -; SHADERTEST-NEXT: .wavefront_size: 0x20 -; SHADERTEST-NEXT: .wgp_mode: false -; SHADERTEST-NEXT: .internal_pipeline_hash: -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: .num_interpolants: 0x1 -; SHADERTEST-NEXT: .registers: {} -; SHADERTEST-NEXT: .shaders: -; SHADERTEST-NEXT: .pixel: -; SHADERTEST-NEXT: .api_shader_hash: -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: - 0 -; 
SHADERTEST-NEXT: .hardware_mapping: -; SHADERTEST-NEXT: - .ps -; SHADERTEST-NEXT: .vertex: -; SHADERTEST-NEXT: .api_shader_hash: -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: .hardware_mapping: -; SHADERTEST-NEXT: - .vs -; SHADERTEST-NEXT: .spill_threshold: 0xffff -; SHADERTEST-NEXT: .streamout_vertex_strides: -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: .type: VsPs -; SHADERTEST-NEXT: .user_data_limit: 0x1 -; SHADERTEST-NEXT: .xgl_cache_info: -; SHADERTEST-NEXT: .128_bit_cache_hash: -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: .llpc_version: {{.*}} -; SHADERTEST-NEXT: amdpal.version: -; SHADERTEST-NEXT: - 0x3 -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: ... +; diff --git a/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe b/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe index 8c25a3d7c9..25d6e37870 100644 --- a/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe +++ b/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe @@ -1,12 +1,11 @@ ; Test to check that the optimization of tessellation factors store are handled as expected -; REQUIRES: do-not-run-me ; RUN: amdllpc %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: @_amdgpu_hs_main( ; SHADERTEST-LABEL: .distribHsPatchCount: ; SHADERTEST-NEXT: %[[HS_PATCH_COUNT_SHIFT:[^ ,]*]] = lshr i32 %mergeWaveInfo, 16 ; SHADERTEST-NEXT: %[[HS_PATCH_COUNT:[^ ,]*]] = and i32 %[[HS_PATCH_COUNT_SHIFT]], 255 -; SHADERTEST-NEXT: store i32 %[[HS_PATCH_COUNT]], ptr addrspace(3) getelementptr inbounds ([649 x i32], ptr addrspace(3) @Lds.HS, i32 0, i32 640), align 4 +; SHADERTEST-NEXT: store i32 %[[HS_PATCH_COUNT]], ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @Lds.HS, i32 2560), align 4 ; SHADERTEST-NEXT: br label %.endDistribHsPatchCount ; SHADERTEST-LABEL: .endDistribHsPatchCount: @@ -17,18 +16,18 @@ ; SHADERTEST-NEXT: br i1 
%validHsVert, label %.beginHs, label %.endHs ; SHADERTEST-LABEL: .endHs: -; SHADERTEST: %[[HS_PATCH_COUNT:[^ ,]*]] = load i32, ptr addrspace(3) getelementptr inbounds ([649 x i32], ptr addrspace(3) @Lds.HS, i32 0, i32 640), align 4 -; SHADERTEST: %hsPatchCount = call i32 @llvm.amdgcn.readfirstlane(i32 %[[HS_PATCH_COUNT]]) +; SHADERTEST: %[[HS_PATCH_COUNT:[^ ,]*]] = load i32, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @Lds.HS, i32 2560), align 4 +; SHADERTEST: %hsPatchCount = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %[[HS_PATCH_COUNT]]) ; SHADERTEST: %validHsPatch = icmp ult i32 %threadIdInGroup, %hsPatchCount ; SHADERTEST: br i1 %validHsPatch, label %.checkSpecialTfInWave, label %.endCheckSpecialTfInWave ; SHADERTEST-LABEL: .checkSpecialTfInWave: -; SHADERTEST-NEXT: %[[OUTER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 6 -; SHADERTEST-NEXT: %[[OUTER_TF_I_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[OUTER_TF_OFFSET_0]] +; SHADERTEST-NEXT: %[[OUTER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 24 +; SHADERTEST-NEXT: %[[OUTER_TF_I_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 %[[OUTER_TF_OFFSET_0]] ; SHADERTEST-NEXT: %[[OUTER_TF_PTR:[^ ,]*]] = getelementptr {{(i8|i32)}}, ptr addrspace(3) %[[OUTER_TF_I_PTR]], i32 {{(256|1024)}} ; SHADERTEST-NEXT: %[[OUTER_TF:[^ ,]*]] = load <4 x float>, ptr addrspace(3) %[[OUTER_TF_PTR]], align 4 -; SHADERTEST-NEXT: %[[INNER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 6 -; SHADERTEST-NEXT: %[[INNER_TF_I_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[INNER_TF_OFFSET_0]] +; SHADERTEST-NEXT: %[[INNER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 24 +; SHADERTEST-NEXT: %[[INNER_TF_I_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 %[[INNER_TF_OFFSET_0]] ; SHADERTEST-NEXT: %[[INNER_TF_PTR:[^ ,]*]] = getelementptr {{(i8|i32)}}, ptr addrspace(3) %[[INNER_TF_I_PTR]], i32 {{(260|1040)}} ; SHADERTEST-NEXT: %[[INNER_TF:[^ ,]*]] = load <2 
x float>, ptr addrspace(3) %[[INNER_TF_PTR]], align 4 ; SHADERTEST-NEXT: %[[OUTER_TF_0:[^ ,]*]] = extractelement <4 x float> %[[OUTER_TF]], i64 0 diff --git a/llpc/test/shaderdb/gfx11/TestGdsOperationsForXfb.vert b/llpc/test/shaderdb/gfx11/TestGdsOperationsForXfb.vert new file mode 100644 index 0000000000..d1d3d65d68 --- /dev/null +++ b/llpc/test/shaderdb/gfx11/TestGdsOperationsForXfb.vert @@ -0,0 +1,33 @@ +// Test to check GDS operations that are required to support GFX11 transform feedback. Also, check +// ds_ordered_count is followed by s_waitcnt lgkmcnt(0), which is required by HW on GFX11. + +// RUN: amdllpc %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s + +// SHADERTEST-LABEL: {{^// LLPC}} final pipeline module info +// SHADERTEST: .prepareXfb: +// SHADERTEST: [[orderedWaveId0:%.*]] = inttoptr i32 %orderedWaveId to ptr addrspace(2) +// SHADERTEST-NEXT: call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) [[orderedWaveId0]], i32 0, i32 0, i32 0, i1 false, i32 16777216, i1 false, i1 false) +// SHADERTEST: call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %{{.*}}, i32 0) +// SHADERTEST-NEXT: call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 0, i32 4) +// SHADERTEST: [[orderedWaveId1:%.*]] = inttoptr i32 %orderedWaveId to ptr addrspace(2) +// SHADERTEST-NEXT: call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) [[orderedWaveId1]], i32 %{{.*}}, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true) + +// SHADERTEST-LABEL: {{^// LLPC}} final ELF info +// SHADERTEST: ds_ordered_count {{v[0-9]*}}, {{v[0-9]*}} gds +// SHADERTEST: s_waitcnt lgkmcnt(0) +// SHADERTEST: ds_add_gs_reg_rtn {{v[[0-9]*:[0-9]*]}}, {{v[0-9]*}} gds ; D9EA0000 03000300 +// SHADERTEST: s_waitcnt lgkmcnt(0) ; BF89FC07 +// SHADERTEST: ds_add_gs_reg_rtn {{v[[0-9]*:[0-9]*]}}, {{v[0-9]*}} offset:4 gds ; D9EA0004 04000600 +// SHADERTEST: s_waitcnt lgkmcnt(0) +// SHADERTEST: ds_ordered_count {{v[0-9]*}}, {{v[0-9]*}} offset:772 gds +// SHADERTEST: s_waitcnt lgkmcnt(0) + +#version 450 core + 
+layout(location = 0, xfb_buffer = 0, xfb_offset = 0, xfb_stride = 16) out vec4 data0; +layout(location = 1, xfb_buffer = 1, xfb_offset = 0, xfb_stride = 16) out vec4 data1; + +void main() { + data0 = vec4(0.0); + data1 = vec4(1.0); +} diff --git a/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe b/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe index f044fff685..04025d52ee 100644 --- a/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe +++ b/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe @@ -2,7 +2,7 @@ ; qualifier. In a location, only part of its components are exported to XFB buffer and they are not ; consecutive. -; RUN: amdllpc -gfxip=11 -stop-after=lgc-patch-copy-shader -v %s | FileCheck -check-prefix=CHECK %s +; RUN: amdllpc -gfxip=11 -stop-after=lgc-generate-copy-shader -v %s | FileCheck -check-prefix=CHECK %s ; CHECK-LABEL: @lgc.shader.COPY.main( ; CHECK: [[TMP1:%.*]] = call float @lgc.ngg.read.GS.output.f32(i32 0, i32 0, i32 0) diff --git a/llpc/test/shaderdb/object/ObjOutput_TestGsBuiltIn_lit.geom b/llpc/test/shaderdb/object/ObjOutput_TestGsBuiltIn_lit.geom index 92590afa53..a1959278ed 100644 --- a/llpc/test/shaderdb/object/ObjOutput_TestGsBuiltIn_lit.geom +++ b/llpc/test/shaderdb/object/ObjOutput_TestGsBuiltIn_lit.geom @@ -29,13 +29,13 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +; SHADERTEST: call void @lgc.output.export.builtin.PrimitiveId{{.*}} +; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} +; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: call void @lgc.output.export.builtin.Position{{.*}}.v4f32 ; SHADERTEST: call void @lgc.output.export.builtin.PointSize{{.*}}f32 ; SHADERTEST: call void @lgc.output.export.builtin.ClipDistance{{.*}}a3f32 ; SHADERTEST: call void @lgc.output.export.builtin.CullDistance{{.*}}a2f32 -; SHADERTEST: call void 
@lgc.output.export.builtin.PrimitiveId{{.*}} -; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} -; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/object/ObjOutput_TestTesBuiltIn_lit.tese b/llpc/test/shaderdb/object/ObjOutput_TestTesBuiltIn_lit.tese index 8834c49826..5d08ec5f9e 100644 --- a/llpc/test/shaderdb/object/ObjOutput_TestTesBuiltIn_lit.tese +++ b/llpc/test/shaderdb/object/ObjOutput_TestTesBuiltIn_lit.tese @@ -21,12 +21,12 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} +; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: call void @lgc.output.export.builtin.Position{{.*}}v4f32 ; SHADERTEST: call void @lgc.output.export.builtin.PointSize{{.*}}f32 ; SHADERTEST: call void @lgc.output.export.builtin.ClipDistance{{.*}}a3f32 ; SHADERTEST: call void @lgc.output.export.builtin.CullDistance{{.*}}a4f32 -; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} -; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/object/ObjOutput_TestVsBuiltIn_lit.vert b/llpc/test/shaderdb/object/ObjOutput_TestVsBuiltIn_lit.vert index 160c114043..f2f8d2eaca 100644 --- a/llpc/test/shaderdb/object/ObjOutput_TestVsBuiltIn_lit.vert +++ b/llpc/test/shaderdb/object/ObjOutput_TestVsBuiltIn_lit.vert @@ -17,12 +17,12 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} +; SHADERTEST: call void 
@lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: call void @lgc.output.export.builtin.Position{{.*}}v4f32 ; SHADERTEST: call void @lgc.output.export.builtin.PointSize{{.*}}f32 ; SHADERTEST: call void @lgc.output.export.builtin.ClipDistance{{.*}}a4f32 ; SHADERTEST: call void @lgc.output.export.builtin.CullDistance{{.*}}a2f32 -; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} -; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call void @llvm.amdgcn.exp.f32 ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe index bdf98525d7..fbe7b5a731 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe @@ -26,7 +26,7 @@ entryPoint = main [FsGlsl] #version 450 core -layout(set = 0, binding = 0) uniform sampler2DMS samp; +layout(set = 0, binding = 2) uniform sampler2DMS samp; layout(location = 0) in vec2 inUV; layout(location = 0) out vec4 oColor; @@ -62,7 +62,7 @@ userDataNode[0].next[1].type = DescriptorFmask userDataNode[0].next[1].offsetInDwords = 12 userDataNode[0].next[1].sizeInDwords = 8 userDataNode[0].next[1].set = 0 -userDataNode[0].next[1].binding = 0 +userDataNode[0].next[1].binding = 2 userDataNode[1].type = IndirectUserDataVaPtr userDataNode[1].offsetInDwords = 12 userDataNode[1].sizeInDwords = 1 diff --git a/llpc/tool/amdllpc.cpp b/llpc/tool/amdllpc.cpp index ce4f0e3a4f..b80cfa591c 100644 --- a/llpc/tool/amdllpc.cpp +++ b/llpc/tool/amdllpc.cpp @@ -370,6 +370,7 @@ extern opt EnablePipelineDump; extern opt PipelineDumpDir; extern opt EnableTimerProfile; extern opt BuildShaderCache; +extern OptionCategory AmdCategory; } // namespace cl } // namespace llvm @@ -412,10 +413,12 @@ CapabilityPrinter 
CapPrinterInstance; ExtensionPrinter ExtPrinterInstance; cl::opt> CapPrinter{"cap", cl::desc("Display the supported Capabilities."), - cl::location(CapPrinterInstance), cl::ValueDisallowed}; + cl::location(CapPrinterInstance), cl::ValueDisallowed, + cl::cat(cl::AmdCategory)}; cl::opt> ExtPrinter{"ext", cl::desc("Display the supported extensions."), - cl::location(ExtPrinterInstance), cl::ValueDisallowed}; + cl::location(ExtPrinterInstance), cl::ValueDisallowed, + cl::cat(cl::AmdCategory)}; } // namespace // ===================================================================================================================== diff --git a/llpc/tool/llpcCompilationUtils.cpp b/llpc/tool/llpcCompilationUtils.cpp index ae6dbf4a6e..06c49d5422 100644 --- a/llpc/tool/llpcCompilationUtils.cpp +++ b/llpc/tool/llpcCompilationUtils.cpp @@ -58,12 +58,12 @@ #endif #include "llpcCompilationUtils.h" +#include "LoweringUtil.h" #include "llpcAutoLayout.h" #include "llpcDebug.h" #include "llpcError.h" #include "llpcInputUtils.h" #include "llpcShaderModuleHelper.h" -#include "llpcSpirvLowerUtil.h" #include "llpcThreading.h" #include "llpcUtil.h" #ifndef LLPC_DISABLE_SPVGEN @@ -404,9 +404,8 @@ Error processInputPipeline(ICompiler *compiler, CompileInfo &compileInfo, const for (auto &libFileName : pipelineState->graphicsLibFileName) { if (!libFileName.empty()) { LLPC_OUTS(libFileName + "\n"); - auto inputSpecOrErr = parseInputFileSpec(libFileName); - assert(!inputSpecOrErr.takeError()); - compileInfo.inputSpecs.push_back(std::move(*inputSpecOrErr)); + InputSpec inputSpec = cantFail(parseInputFileSpec(libFileName)); + compileInfo.inputSpecs.push_back(std::move(inputSpec)); } } return Error::success(); diff --git a/llpc/tool/llpcShaderCache.h b/llpc/tool/llpcShaderCache.h index fa5ffd126f..dcbe3b6c26 100644 --- a/llpc/tool/llpcShaderCache.h +++ b/llpc/tool/llpcShaderCache.h @@ -173,7 +173,7 @@ class IShaderCache { IShaderCache() {} /// @internal Destructor. 
Prevent use of delete operator on this interface. - virtual ~IShaderCache() {} + virtual ~IShaderCache() = default; }; #endif diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.cpp b/llpc/translator/lib/SPIRV/SPIRVReader.cpp index e1b7cd8455..ac0b198805 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.cpp +++ b/llpc/translator/lib/SPIRV/SPIRVReader.cpp @@ -4902,6 +4902,8 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) { Type *const ptrType = transType(spvVar->getType()); unsigned addrSpace = ptrType->getPointerAddressSpace(); + auto llpcContext = static_cast(m_context); + auto buildInfo = static_cast(llpcContext->getPipelineBuildInfo()); Type *const varType = transType(spvVarType, 0, true, layout); @@ -4927,9 +4929,12 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) { } } if (!isBuiltIn) { - // Initializize user-defined output variable to zero + // Initialize user-defined output variable to zero initializer = Constant::getNullValue(varType); } + } else if (buildInfo->enableInitUndefZero && (storageClass == SPIRVStorageClassKind::StorageClassPrivate || + storageClass == SPIRVStorageClassKind::StorageClassFunction)) { + initializer = Constant::getNullValue(varType); } bool readOnly = false; @@ -5262,15 +5267,15 @@ lgc::CooperativeMatrixElementType SPIRVToLLVM::mapToBasicType(SPIRVType *const e lgc::CooperativeMatrixLayout SPIRVToLLVM::getLayout(lgc::CooperativeMatrixElementType elemType) { const Vkgc::GfxIpVersion gfxIp = getPipelineContext()->getGfxIpVersion(); - if (elemType == lgc::CooperativeMatrixElementType::Int32 || elemType == lgc::CooperativeMatrixElementType::Float32) { + + if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 32)) { if (gfxIp.major == 11) return lgc::CooperativeMatrixLayout::AccumulatorMatrixLayout; return lgc::CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout; } - if (elemType == lgc::CooperativeMatrixElementType::Int16 || elemType == lgc::CooperativeMatrixElementType::Int8 || - 
elemType == lgc::CooperativeMatrixElementType::Float16) { + if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 16) || BuilderCommon::isTypeNCooperativeMatrix(elemType, 8)) return lgc::CooperativeMatrixLayout::FactorMatrixLayout; - } + llvm_unreachable("The element type is not supported!"); return lgc::CooperativeMatrixLayout::InvalidLayout; } @@ -5292,7 +5297,7 @@ lgc::CooperativeMatrixLayout SPIRVToLLVM::getCooperativeMatrixKHRLayout(Cooperat if (use == CooperativeMatrixUse::CooperativeMatrixUseMatrixAccumulatorKHR) { if (gfxIp.major == 11) return lgc::CooperativeMatrixLayout::AccumulatorMatrixLayout; - if (elemType == lgc::CooperativeMatrixElementType::Float32 || elemType == lgc::CooperativeMatrixElementType::Int32) + if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 32)) return lgc::CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout; if (elemType == lgc::CooperativeMatrixElementType::Int16 || elemType == lgc::CooperativeMatrixElementType::Float16) return lgc::CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout; @@ -5526,9 +5531,10 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(static_cast(spvInst)->getMatrixBSigned()); bool isSat = static_cast(static_cast(spvInst)->getMatrixSatAccumulation()); - Value *coopMatrixD = getBuilder()->create(coopMatrixC->getType(), coopMatrixA, coopMatrixB, - coopMatrixC, isSignedA, isSignedB, isSat, 0, - elemBasicTypeC, elemBasicTypeA, "mulAdd"); + // Current SPIRV does not supported fp8 or bf8 yet, so the types of A and B use the same value. 
+ Value *coopMatrixD = getBuilder()->create( + coopMatrixC->getType(), coopMatrixA, coopMatrixB, coopMatrixC, isSignedA, isSignedB, isSat, 0, elemBasicTypeA, + elemBasicTypeA, elemBasicTypeC, "mulAdd"); return coopMatrixD; } diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEntry.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEntry.h index dbb07263b4..b673630d9c 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEntry.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEntry.h @@ -185,7 +185,7 @@ class SPIRVEntry { SPIRVEntry() : Module(NULL), OpCode(OpNop), Id(SPIRVID_INVALID), Attrib(SPIRVEA_DEFAULT), WordCount(0), Line(nullptr) {} - virtual ~SPIRVEntry() {} + virtual ~SPIRVEntry() = default; bool exist(SPIRVId) const; template T *get(SPIRVId TheId) const { return reinterpret_cast(getEntry(TheId)); } diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp index df8d706f6e..eeff4e8256 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp @@ -55,9 +55,6 @@ namespace SPIRV { SPIRVModule::SPIRVModule() : AutoAddCapability(true), ValidateCapability(false) { } -SPIRVModule::~SPIRVModule() { -} - class SPIRVModuleImpl : public SPIRVModule { public: SPIRVModuleImpl() diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h index f1757ee767..3f3a3c6835 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h @@ -88,7 +88,7 @@ class SPIRVModule { static SPIRVModule *createSPIRVModule(); SPIRVModule(); - virtual ~SPIRVModule(); + virtual ~SPIRVModule() = default; // Object query functions virtual bool exist(SPIRVId) const = 0; diff --git a/llvmraytracing/CMakeLists.txt b/llvmraytracing/CMakeLists.txt index cdc578e061..bc041f02bf 100644 --- a/llvmraytracing/CMakeLists.txt +++ b/llvmraytracing/CMakeLists.txt @@ -35,6 +35,7 @@ 
add_llvm_library(LLVMRaytracing lib/LowerRayQuery.cpp lib/LowerRaytracingPipeline.cpp lib/PassRegistry.inc + lib/PipelineState.cpp lib/PayloadAccessQualifiers.cpp lib/RemoveTypesMetadata.cpp @@ -43,6 +44,7 @@ add_llvm_library(LLVMRaytracing LINK_COMPONENTS Analysis + BinaryFormat Core Coroutines IPO @@ -107,8 +109,3 @@ if(LLVMRAYTRACING_BUILD_TESTS) add_custom_target(check-continuations DEPENDS check-llvmraytracing) add_custom_target(check-continuations-units DEPENDS check-llvmraytracing-units) endif() - -# Temporary alias -- to be removed when Vulkan CI and DXCP have been updated. -if (LLPC_RAYTRACING_ADD_TRANSITION_TARGETS) - add_library(LLVMContinuations ALIAS LLVMRaytracing) -endif() diff --git a/llvmraytracing/include/continuations/Continuations.h b/llvmraytracing/include/continuations/Continuations.h deleted file mode 100644 index 1e137767d8..0000000000 --- a/llvmraytracing/include/continuations/Continuations.h +++ /dev/null @@ -1,2 +0,0 @@ -// Transition header -- to be removed -#include "llvmraytracing/Continuations.h" diff --git a/llvmraytracing/include/continuations/ContinuationsUtil.h b/llvmraytracing/include/continuations/ContinuationsUtil.h deleted file mode 100644 index c346b5f5ae..0000000000 --- a/llvmraytracing/include/continuations/ContinuationsUtil.h +++ /dev/null @@ -1,2 +0,0 @@ -// Transition header -- to be removed -#include "llvmraytracing/ContinuationsUtil.h" diff --git a/llvmraytracing/include/llvmraytracing/Continuations.h b/llvmraytracing/include/llvmraytracing/Continuations.h index d4f3733991..3b94ad0c90 100644 --- a/llvmraytracing/include/llvmraytracing/Continuations.h +++ b/llvmraytracing/include/llvmraytracing/Continuations.h @@ -161,31 +161,6 @@ uint64_t computePayloadSpillSize(uint64_t NumI32s, uint64_t NumReservedRegisters // of individual bytes at the end if NumBytes is not a multiple of 4. 
void copyBytes(IRBuilder<> &B, Value *Dst, Value *Src, uint64_t NumBytes); -class DialectContextAnalysisResult { -public: - DialectContextAnalysisResult() {} - - bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &, llvm::ModuleAnalysisManager::Invalidator &) { - return false; - } -}; - -/// An analysis to run with dialects, even if the running tool does not have -/// explicit support for it. This will create a dialect context on-demand. -class DialectContextAnalysis : public llvm::AnalysisInfoMixin { -public: - using Result = DialectContextAnalysisResult; - DialectContextAnalysis(bool NeedDialectContext = true); - Result run(llvm::Module &module, llvm::ModuleAnalysisManager &); - static llvm::AnalysisKey Key; - -private: - std::unique_ptr Context; - // If true, this analysis is responsible to create a dialect context. - // If false, a context is already created outside of the pass pipeline. - bool NeedDialectContext; -}; - class LegacyCleanupContinuationsPass : public llvm::PassInfoMixin { public: LegacyCleanupContinuationsPass() {} @@ -203,37 +178,7 @@ class CleanupContinuationsPass : public llvm::PassInfoMixin Functions; - /// Size of the continuation state in byte - uint32_t ContStateBytes = 0; - CallInst *MallocCall = nullptr; - MDNode *MD = nullptr; - SmallVector NewFunctions; - }; - - void removeContFreeCall(Function *F, Function *ContFree); - Value *getContinuationFramePtr(Function *F, bool IsStart, const ContinuationData &ContinuationInfo, - SmallVector *InstsToRemove = nullptr); - void freeCpsStack(Function *F, ContinuationData &CpsInfo); - void updateCpsStack(Function *F, Function *NewFunc, bool IsStart, ContinuationData &CpsInfo); - void analyzeContinuation(Function &F, MDNode *MD); - void processContinuations(); - void handleContinue(ContinuationData &Data, Instruction *Ret); - void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun); - void lowerIntrinsicCall(Module &Mod); - void lowerGetResumePoint(Module &Mod); 
- - llvm_dialects::Builder *Builder = nullptr; - Function *ContMalloc = nullptr; - Function *ContFree = nullptr; - MapVector ToProcess; - uint32_t MaxContStateBytes; - llvm::Module *GpurtLibrary = nullptr; bool Use64BitContinuationReferences; - llvm::Type *ContinuationReferenceType = nullptr; }; // Define a wrapper pass that is used for CleanupContinuationsPass creating diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h index e05a875c9d..f134257833 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h +++ b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h @@ -125,6 +125,31 @@ struct ContSetting { uint64_t Value; }; +class DialectContextAnalysisResult { +public: + DialectContextAnalysisResult() {} + + bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &, llvm::ModuleAnalysisManager::Invalidator &) { + return false; + } +}; + +/// An analysis to run with dialects, even if the running tool does not have +/// explicit support for it. This will create a dialect context on-demand. +class DialectContextAnalysis : public llvm::AnalysisInfoMixin { +public: + using Result = DialectContextAnalysisResult; + DialectContextAnalysis(bool NeedDialectContext = true); + Result run(llvm::Module &Module, llvm::ModuleAnalysisManager &); + static llvm::AnalysisKey Key; + +private: + std::unique_ptr Context; + // If true, this analysis is responsible to create a dialect context. + // If false, a context is already created outside of the pass pipeline. + bool NeedDialectContext; +}; + // Helper class to access data specific to continuation passes, e.g. // metadata or globals. class ContHelper { @@ -162,32 +187,17 @@ class ContHelper { // for PAQed fields, and all other data required in a particular stage (e.g. // hit attributes). 
// - // [in] PreservedPayloadRegisterCount: - // The required number of preserved payload registers for functions that - // are not aware of payload types (e.g. Intersection or Traversal), if known. - // This gives an upper bound on the number of payload registers used by other - // functions together with functions in the current module. - // Setting this value can be used to reduce the number of preserved registers - // for such functions to prevent having to preserve the maximum possible - // amount of payload registers. This is used when compiling a specialized - // Traversal function for a pipeline after all shaders in the pipeline have - // been processed. - // For intersection, it is not used, because early-compiled intersection - // shaders can be used in pipelines with large payload types unknown when - // compiling the intersection shader. - static constexpr const char *MDPreservedPayloadRegisterCountName = "continuation.preservedPayloadRegisterCount"; // [in] MaxPayloadRegisterCount // The maximum allowed number of payload registers to be used for payload and // other inter-stage date (e.g. attributes). If state does not fit into this // limit, we spill to the continuation stack. static constexpr const char *MDMaxPayloadRegisterCountName = "continuation.maxPayloadRegisterCount"; - // [out] MaxUsedPayloadRegisterCount + // [in/out] MaxUsedPayloadRegisterCount // The maximum number of payload registers written or read by any - // shader in the module. This excludes intersection shaders, which + // shader in the pipeline. This excludes intersection shaders, which // just pass through an existing payload. - // This can be used to populate PreservedPayloadRegisterCount when compiling - // the driver module in case all modules of the pipeline are known and - // have already been processed. + // If this is set on a driver module, we rely on it being an upper bound on the + // number of payload registers that need to be preserved. 
static constexpr const char *MDMaxUsedPayloadRegisterCountName = "continuation.maxUsedPayloadRegisterCount"; // The address space used to store the continuations stack. // The possible values for this metadata are the values of ContStackAddrspace. @@ -348,7 +358,11 @@ class ContHelper { static std::optional tryGet##NAME(const Module &M) { return NAME::tryGetValue(&M); } \ static void set##NAME(Module &M, uint32_t Value) { NAME::setValue(&M, Value); } - MODULE_METADATA_HELPER(PreservedPayloadRegisterCount, MDPreservedPayloadRegisterCountName) + static std::optional tryGetPreservedPayloadRegisterCount(const Module &M) { + return tryGetMaxUsedPayloadRegisterCount(M); + } + static void setPreservedPayloadRegisterCount(Module &M, uint32_t Value) { setMaxUsedPayloadRegisterCount(M, Value); } + MODULE_METADATA_HELPER(MaxUsedPayloadRegisterCount, MDMaxUsedPayloadRegisterCountName) MODULE_METADATA_HELPER(MaxPayloadRegisterCount, MDMaxPayloadRegisterCountName) MODULE_METADATA_HELPER(Rtip, MDRtipName) @@ -356,11 +370,6 @@ class ContHelper { #undef MODULE_METADATA_HELPER - // Old alias until clients are migrated to setPreservedPayloadRegisterCount: - static void setMinPayloadRegisterCount(Module &M, uint32_t PreservedPayloadRegisterCount) { - PreservedPayloadRegisterCount::setValue(&M, PreservedPayloadRegisterCount); - } - // If there is module-level metadata specifying the stack addrspace, // return that value. Otherwise, return std::nullopt. 
static std::optional tryGetStackAddrspace(const Module &M) { diff --git a/llvmraytracing/include/llvmraytracing/LowerRayQuery.h b/llvmraytracing/include/llvmraytracing/LowerRayQuery.h index bbaba8793b..2b9d3ae344 100644 --- a/llvmraytracing/include/llvmraytracing/LowerRayQuery.h +++ b/llvmraytracing/include/llvmraytracing/LowerRayQuery.h @@ -46,6 +46,7 @@ class GpurtGetStaticFlagsOp; class GpurtStackReadOp; class GpurtStackWriteOp; class GpurtLdsStackInitOp; +class GpurtGetRayStaticIdOp; namespace rtq { class InitializeOp; @@ -151,6 +152,7 @@ class LowerRayQuery : public llvm::PassInfoMixin { void visitStackReadOp(lgc::GpurtStackReadOp &inst); void visitStackWriteOp(lgc::GpurtStackWriteOp &inst); void visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst); + void visitGetRayStaticIdOp(lgc::GpurtGetRayStaticIdOp &inst); void visitHitAccessor(GpurtFunc instType, llvm::Value *rayQuery, bool committed, llvm::CallBase *inst); void visitAccessor(GpurtFunc instType, llvm::Value *rayQuery, llvm::CallBase *inst); @@ -164,10 +166,12 @@ class LowerRayQuery : public llvm::PassInfoMixin { llvm::Module *m_gpurtModule = nullptr; llvm::Function **m_gpurtFuncs = nullptr; llvm::SmallVector m_rtqAlloc; + llvm::SmallVector m_callsToLower; llvm::SmallSet m_funcsToLower; llvm_dialects::Builder *m_builder = nullptr; CompilerUtils::TypeLowering *m_typeLowering = nullptr; llvm::Type *m_rtqType = nullptr; + unsigned m_traceRayId = 0; }; } // namespace rt diff --git a/llvmraytracing/include/llvmraytracing/PipelineState.h b/llvmraytracing/include/llvmraytracing/PipelineState.h new file mode 100644 index 0000000000..f00f335ac3 --- /dev/null +++ b/llvmraytracing/include/llvmraytracing/PipelineState.h @@ -0,0 +1,93 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file PipelineState.h + * @brief Declaration of pipeline state owned by llvmraytracing + * + * Some optimizations implemented in llvmraytracing depend on cross-module state. + * For instance, when compiling the Traversal shader, we need an upper bound on the payload size. + * + * This class keeps track of any such state that is owned my llvmraytracing, meaning it is produced + * and consumed by llvmraytracing passes, and it can be changed without pipeline compiler (e.g. LLPC) changes. 
+ * + * It supports importing/exporting from/to module metadata, merging with other pipeline states, and + * serialization/deserialization to binary blobs via MsgPack. + * + * It is intended to be used like this by pipeline compilers (such as LLPC): + * * After processing of an app module, its pipeline state is extracted from metadata, and merged with earlier state. + * * Before compiling a module with full pipeline knowledge (e.g. when compiling the Traversal shader), the merged + * state is exported to the module. + * * After having compiled a library/pipeline that might be reused by a child pipeline, its state is serialized. + * * When reusing an early-compiled parent library/pipeline, its state is deserialized and merged into the current + * pipeline's state. + * + * The pipeline compiler is not expected to collect and merge state of early-compiled driver modules (GpuRt), + * as these are compiled independently per pipeline, and thus compilation of child pipeline driver functions shouldn't + * depend on parent pipeline driver functions. + * + *********************************************************************************************************************** + */ +#pragma once + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +namespace llvm { +class Module; +namespace msgpack { +class DocNode; +} // namespace msgpack +} // namespace llvm + +namespace llvmraytracing { + +class PipelineState { +public: + // Construct a new trivial pipeline state which can be used to be merged with non-trivial state later. + PipelineState() = default; + + // (De)serialization to/from MsgPack is both supported standalone, or as part of an outer MsgPack document. 
+ static llvm::Expected decodeMsgpack(llvm::StringRef Data); + // Node is non-const because the const-correct accessors are less convenient to work with + static llvm::Expected decodeMsgpack(llvm::msgpack::DocNode &Node); + std::string encodeMsgpack() const; + void encodeMsgpack(llvm::msgpack::DocNode &Node) const; + + static llvm::Expected fromModuleMetadata(const llvm::Module &M); + void exportModuleMetadata(llvm::Module &M) const; + + void merge(const PipelineState &Other); + +private: + // Actual state is intentionally private, as this interface is intended to be used like opaque state. + // llvmraytracing passes don't use this interface, and instead directly work on module metadata. + + // The maximum occurring number of payload registers in the pipeline, which will be taken into account for Traversal + // module so that it sees the correct maximum payload size of a pipeline. + unsigned MaxUsedPayloadRegisterCount = 0; +}; + +} // namespace llvmraytracing diff --git a/llvmraytracing/lib/CleanupContinuations.cpp b/llvmraytracing/lib/CleanupContinuations.cpp index 93e1f8f66b..0a6639b278 100644 --- a/llvmraytracing/lib/CleanupContinuations.cpp +++ b/llvmraytracing/lib/CleanupContinuations.cpp @@ -75,6 +75,52 @@ using namespace lgc; #define DEBUG_TYPE "cleanup-continuations" +namespace { + +class CleanupContinuationsPassImpl { +public: + CleanupContinuationsPassImpl(llvm::Module &M, llvm::ModuleAnalysisManager &AM, + bool Use64BitContinuationReferences = false); + + PreservedAnalyses run(); + +private: + struct ContinuationData { + /// All functions belonging to this continuation, the entry function is the + /// first one + SmallVector Functions; + /// Size of the continuation state in byte + uint32_t ContStateBytes = 0; + CallInst *MallocCall = nullptr; + MDNode *MD = nullptr; + SmallVector NewFunctions; + }; + + void removeContFreeCall(Function *F, Function *ContFree); + Value *getContinuationFramePtr(Function *F, bool IsStart, const ContinuationData 
&ContinuationInfo, + SmallVector *InstsToRemove = nullptr); + void freeCpsStack(Function *F, ContinuationData &CpsInfo); + void updateCpsStack(Function *F, Function *NewFunc, bool IsStart, ContinuationData &CpsInfo); + void analyzeContinuation(Function &F, MDNode *MD); + void processContinuations(); + void handleContinue(ContinuationData &Data, Instruction *Ret); + void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun); + void lowerIntrinsicCall(Module &Mod); + void lowerGetResumePoint(Module &Mod); + bool lowerCompleteOp(Module &Mod); + + llvm::Module &Mod; + llvm::ModuleAnalysisManager &AnalysisManager; + llvm_dialects::Builder *Builder = nullptr; + Function *ContMalloc = nullptr; + Function *ContFree = nullptr; + MapVector ToProcess; + uint32_t MaxContStateBytes; + llvm::Module *GpurtLibrary = nullptr; + bool Use64BitContinuationReferences; + llvm::Type *ContinuationReferenceType = nullptr; +}; + /// Find the original call that created the continuation token and the matching /// resume function for a return value. 
/// @@ -139,7 +185,7 @@ findTokenOrigin(BasicBlock *BB, Value *V, SmallVectorImpl &ToRemo return Result; } -void CleanupContinuationsPass::analyzeContinuation(Function &F, MDNode *MD) { +void CleanupContinuationsPassImpl::analyzeContinuation(Function &F, MDNode *MD) { // Only analyze main continuation auto *MDTup = cast(MD); auto *EntryF = mdconst::extract(MDTup->getOperand(0)); @@ -173,7 +219,8 @@ void CleanupContinuationsPass::analyzeContinuation(Function &F, MDNode *MD) { MaxContStateBytes = Data.ContStateBytes; } -void CleanupContinuationsPass::updateCpsStack(Function *F, Function *NewFunc, bool IsStart, ContinuationData &CpsInfo) { +void CleanupContinuationsPassImpl::updateCpsStack(Function *F, Function *NewFunc, bool IsStart, + ContinuationData &CpsInfo) { Builder->SetInsertPoint(&*NewFunc->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); Value *CpsStack = nullptr; @@ -247,9 +294,9 @@ static void buildCpsArgInfos(Function *F, bool IsStart, SmallVector &All /// Find the continuation state pointer, either returned by the malloc or /// given as an argument -Value *CleanupContinuationsPass::getContinuationFramePtr(Function *F, bool IsStart, - const ContinuationData &ContinuationInfo, - SmallVector *InstsToRemove) { +Value *CleanupContinuationsPassImpl::getContinuationFramePtr(Function *F, bool IsStart, + const ContinuationData &ContinuationInfo, + SmallVector *InstsToRemove) { if (!ContinuationInfo.MallocCall) return IsStart ? F->getArg(F->arg_size() - 1) : F->getArg(0); @@ -267,7 +314,7 @@ Value *CleanupContinuationsPass::getContinuationFramePtr(Function *F, bool IsSta /// Remove call to continuation.free() in F, ContFree is the pointer to /// declaration of continuation.free(). 
-void CleanupContinuationsPass::removeContFreeCall(Function *F, Function *ContFree) { +void CleanupContinuationsPassImpl::removeContFreeCall(Function *F, Function *ContFree) { for (auto *User : make_early_inc_range(ContFree->users())) { if (auto *Call = dyn_cast(User)) { if (Call->getFunction() == F) { @@ -279,7 +326,7 @@ void CleanupContinuationsPass::removeContFreeCall(Function *F, Function *ContFre } /// Insert cps.free() before the original function exits and lgc.cps.complete calls. -void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsInfo) { +void CleanupContinuationsPassImpl::freeCpsStack(Function *F, ContinuationData &CpsInfo) { struct VisitState { ContinuationData &CpsInfo; llvm_dialects::Builder *Builder; @@ -288,7 +335,6 @@ void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsIn VisitState State = {CpsInfo, Builder, F}; static const auto Visitor = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .addSet([](auto &State, auto &Instruction) { if (Instruction.getFunction() == State.F && State.CpsInfo.ContStateBytes) { State.Builder->SetInsertPoint(&Instruction); @@ -299,7 +345,31 @@ void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsIn Visitor.visit(State, *F); } -void CleanupContinuationsPass::processContinuations() { +/// Handle lgc.cps.complete calls. 
+bool CleanupContinuationsPassImpl::lowerCompleteOp(Module &Mod) { + struct VisitState { + llvm_dialects::Builder *Builder; + bool completeLowered; + }; + + bool completeLowered = false; + VisitState State = {Builder, completeLowered}; + static auto Visitor = llvm_dialects::VisitorBuilder() + .add([](VisitState &State, auto &complete) { + State.Builder->SetInsertPoint(&complete); + State.Builder->CreateRetVoid(); + BasicBlock *block = complete.getParent(); + block->getTerminator()->eraseFromParent(); + complete.eraseFromParent(); + State.completeLowered = true; + }) + .build(); + + Visitor.visit(State, Mod); + return State.completeLowered; +} + +void CleanupContinuationsPassImpl::processContinuations() { // Summarize of what to do here: // 1. Continuation Stack // a.) cps.alloc() in start, and cps.peek() cps.free() in resume. @@ -362,17 +432,6 @@ void CleanupContinuationsPass::processContinuations() { auto *I = BB.getTerminator(); if (isa(I)) { handleContinue(FuncData.second, I); - } else if (I->getOpcode() == Instruction::Unreachable) { - // We should only have 'lgc.cps.complete' or 'lgc.cps.jump' calls before unreachable. - auto *Call = cast(--I->getIterator()); - if (isa(Call)) { - Builder->SetInsertPoint(Call); - Builder->CreateRetVoid(); - Call->eraseFromParent(); - I->eraseFromParent(); - } else { - assert(isa(Call)); - } } } @@ -413,7 +472,7 @@ void CleanupContinuationsPass::processContinuations() { /// i32 %cr2, ...) /// /// Also handles cases where the token and resume function are behind a phi. 
-void CleanupContinuationsPass::handleContinue(ContinuationData &Data, Instruction *Ret) { +void CleanupContinuationsPassImpl::handleContinue(ContinuationData &Data, Instruction *Ret) { // Find the function call that generates the token LLVM_DEBUG(dbgs() << "Converting ret to continue: " << *Ret << "\nArgument: " << *Ret->getOperand(0) << "\n"); auto *BB = Ret->getParent(); @@ -438,7 +497,7 @@ void CleanupContinuationsPass::handleContinue(ContinuationData &Data, Instructio } } -void CleanupContinuationsPass::handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun) { +void CleanupContinuationsPassImpl::handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun) { Builder->SetInsertPoint(Call); SmallVector TailArgs; @@ -478,7 +537,7 @@ void CleanupContinuationsPass::handleSingleContinue(ContinuationData &Data, Call } /// Lower lgc.rt calls inside cps functions. -void CleanupContinuationsPass::lowerIntrinsicCall(Module &Mod) { +void CleanupContinuationsPassImpl::lowerIntrinsicCall(Module &Mod) { DenseMap> CpsIntrinsicCalls; // We only care about lgc.rt here. 
@@ -520,7 +579,7 @@ void CleanupContinuationsPass::lowerIntrinsicCall(Module &Mod) { } } -void CleanupContinuationsPass::lowerGetResumePoint(Module &Mod) { +void CleanupContinuationsPassImpl::lowerGetResumePoint(Module &Mod) { for (auto &F : make_early_inc_range(Mod)) { auto FuncName = F.getName(); if (!FuncName.starts_with("_AmdGetResumePointAddr")) @@ -544,7 +603,12 @@ void CleanupContinuationsPass::lowerGetResumePoint(Module &Mod) { } } -llvm::PreservedAnalyses CleanupContinuationsPass::run(llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager) { +CleanupContinuationsPassImpl::CleanupContinuationsPassImpl(llvm::Module &M, llvm::ModuleAnalysisManager &AM, + bool Use64BitContinuationReferences) + : Mod(M), AnalysisManager(AM), Use64BitContinuationReferences{Use64BitContinuationReferences} { +} + +llvm::PreservedAnalyses CleanupContinuationsPassImpl::run() { LLVM_DEBUG(dbgs() << "Run the lgc-cleanup-continuations pass\n"); AnalysisManager.getResult(Mod); auto &FAM = AnalysisManager.getResult(Mod).getManager(); @@ -614,13 +678,26 @@ llvm::PreservedAnalyses CleanupContinuationsPass::run(llvm::Module &Mod, llvm::M } } + bool Changed = false; if (!ToProcess.empty()) { processContinuations(); // Lower lgc.rt intrinsics lowerIntrinsicCall(Mod); lowerGetResumePoint(Mod); - return PreservedAnalyses::none(); + Changed = true; } - return PreservedAnalyses::all(); + + Changed |= lowerCompleteOp(Mod); + + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +} // namespace + +llvm::PreservedAnalyses CleanupContinuationsPass::run(llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager) { + LLVM_DEBUG(dbgs() << "Run the cleanup-continuations pass\n"); + AnalysisManager.getResult(Mod); + CleanupContinuationsPassImpl Impl(Mod, AnalysisManager, Use64BitContinuationReferences); + return Impl.run(); } diff --git a/llvmraytracing/lib/Continuations.cpp b/llvmraytracing/lib/Continuations.cpp index 29c616181d..a9bb95cc3a 100644 --- a/llvmraytracing/lib/Continuations.cpp +++ b/llvmraytracing/lib/Continuations.cpp @@ -832,6 +832,12 @@ CallInst *llvm::replaceIntrinsicCall(IRBuilder<> &B, Type *SystemDataTy, Value * } } + // Tolerate Replacement returning a single-element struct containing a value of the right type. + if (!Call->getType()->isVoidTy() && Call->getType() != Replacement->getType()) { + assert(cast(Replacement->getType())->getNumElements() == 1); + Replacement = B.CreateExtractValue(Replacement, 0); + } + LLVM_DEBUG(dbgs() << "Replacing " << *Call << " by " << *NewCall << "\n"); if (!Call->getType()->isVoidTy()) Call->replaceAllUsesWith(Replacement); diff --git a/llvmraytracing/lib/LegacyCleanupContinuations.cpp b/llvmraytracing/lib/LegacyCleanupContinuations.cpp index db36d52c62..6a6c2b921b 100644 --- a/llvmraytracing/lib/LegacyCleanupContinuations.cpp +++ b/llvmraytracing/lib/LegacyCleanupContinuations.cpp @@ -41,6 +41,7 @@ #include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "llvm-dialects/Dialect/Builder.h" +#include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Instructions.h" @@ -88,6 +89,7 @@ class LegacyCleanupContinuationsPassImpl { void handleFunctionEntry(ContinuationData &Data, Function *F, bool IsEntry); void handleContinue(ContinuationData &Data, Instruction *Ret); void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun); + 
bool lowerCompleteOp(Module &M); Module &M; LLVMContext &Context; @@ -295,6 +297,26 @@ Value *getContFrame(CallInst *MallocCall, Function *F, bool IsStart, SmallVector return ContFrame; } +bool LegacyCleanupContinuationsPassImpl::lowerCompleteOp(Module &M) { + struct VisitState { + llvm_dialects::Builder &Builder; + bool completeLowered; + }; + + bool completeLowered = false; + VisitState State = {B, completeLowered}; + static auto Visitor = llvm_dialects::VisitorBuilder() + .add([](VisitState &State, auto &complete) { + State.Builder.SetInsertPoint(&complete); + llvm::terminateShader(State.Builder, &complete); + State.completeLowered = true; + }) + .build(); + + Visitor.visit(State, M); + return State.completeLowered; +} + void LegacyCleanupContinuationsPassImpl::processContinuation(Function *StartFunc, ContinuationData &FuncData) { auto *Void = Type::getVoidTy(Context); LLVM_DEBUG(dbgs() << "Processing function: " << StartFunc->getName() << "\n"); @@ -471,8 +493,6 @@ void LegacyCleanupContinuationsPassImpl::processContinuation(Function *StartFunc uint32_t NeededStackSize = FuncData.getContStateStackBytes(); if (NeededStackSize > 0) B.create(B.getInt32(NeededStackSize)); - - llvm::terminateShader(B, PrevInst); } else { LLVM_DEBUG(PrevInst->dump()); llvm_unreachable("Unexpected instruction!"); @@ -670,6 +690,8 @@ PreservedAnalyses LegacyCleanupContinuationsPassImpl::run() { fixupDxilMetadata(M); } + Changed |= lowerCompleteOp(M); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvmraytracing/lib/LgcCpsJumpInliner.cpp b/llvmraytracing/lib/LgcCpsJumpInliner.cpp index e970060385..3890117b02 100644 --- a/llvmraytracing/lib/LgcCpsJumpInliner.cpp +++ b/llvmraytracing/lib/LgcCpsJumpInliner.cpp @@ -69,16 +69,16 @@ LgcCpsJumpInlinerPassImpl::LgcCpsJumpInlinerPassImpl(Module &M, Module &GpurtLib PreservedAnalyses LgcCpsJumpInlinerPassImpl::run() { using JumpVecTy = SmallVector; - static const auto Visitor = - llvm_dialects::VisitorBuilder>() - .add([](SmallVector &AllJumps, JumpOp &Jump) { AllJumps.push_back(&Jump); }) - .build(); + static const auto Visitor = llvm_dialects::VisitorBuilder>() + .add([](JumpVecTy &AllJumps, JumpOp &Jump) { AllJumps.push_back(&Jump); }) + .build(); JumpVecTy AllJumps; // Collect lgc.cps.jump ops. Visitor.visit(AllJumps, *Mod); bool Changed = false; + DenseSet DeadFunctions; // Iterate over all collected jumps and try to inline the jump target. for (auto *Jump : AllJumps) { auto *AsCROp = dyn_cast(Jump->getTarget()); @@ -113,12 +113,16 @@ PreservedAnalyses LgcCpsJumpInlinerPassImpl::run() { AsCROp->eraseFromParent(); // There might still be other users left, if the function is not referenced as direct jump target. + // Remove function after this loop, it may contain jumps that we still want to inline. if (JumpTargetFunc->user_empty() && JumpTargetFunc->getLinkage() == GlobalValue::InternalLinkage) - JumpTargetFunc->eraseFromParent(); + DeadFunctions.insert(JumpTargetFunc); Changed = true; } + for (auto *F : DeadFunctions) + F->eraseFromParent(); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvmraytracing/lib/LowerRayQuery.cpp b/llvmraytracing/lib/LowerRayQuery.cpp index 9df3b4b6f7..f94e3cc0df 100644 --- a/llvmraytracing/lib/LowerRayQuery.cpp +++ b/llvmraytracing/lib/LowerRayQuery.cpp @@ -53,24 +53,24 @@ static const char *const GpurtFuncNames[] = { "_RayQuery_CommitNonOpaqueTriangleHit", "_RayQuery_CommitProceduralPrimitiveHit", "_RayQuery_EndInterleavedProceed", - "FetchTrianglePositionFromRayQuery", + "_RayQuery_FetchTrianglePosition", "_RayQuery_GeometryIndex", "_RayQuery_GetObjId", "_RayQuery_InstanceContributionToHitGroupIndex", "_RayQuery_InstanceID", "_RayQuery_InstanceIndex", "_RayQuery_IntersectionType", - "LongRayQueryProceedAMD", + "_RayQuery_LongProceedAMD", "_RayQuery_ObjectRayDirection", "_RayQuery_ObjectRayOrigin", "_RayQuery_ObjectToWorld4x3", "_RayQuery_PrimitiveIndex", "_RayQuery_RayFlags", - "RayQueryProceed", + "_RayQuery_Proceed", "_RayQuery_RayT", "_RayQuery_RayTMin", "_RayQuery_SetObjId", - "TraceRayInline", + "_RayQuery_TraceRayInline", "_RayQuery_TriangleBarycentrics", "_RayQuery_TriangleFrontFace", "_RayQuery_WorldRayDirection", @@ -604,6 +604,17 @@ void LowerRayQuery::visitLdsStackInitOp(GpurtLdsStackInitOp &inst) { inst.setUseExtraStack(true); } +// ===================================================================================================================== +// Visits "lgc.gpurt.get.ray.static.id" instructions +// +// @param inst : The instruction +void LowerRayQuery::visitGetRayStaticIdOp(GpurtGetRayStaticIdOp &inst) { + auto hashcode = hash_combine(m_traceRayId++, inst.getFunction()->getName()); + inst.replaceAllUsesWith(m_builder->getInt32(hashcode)); + m_callsToLower.push_back(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + // ===================================================================================================================== // Executes this LowerRayquery pass on the specified LLVM module. 
// @@ -622,7 +633,6 @@ PreservedAnalyses LowerRayQuery::run(Module &module, ModuleAnalysisManager &anal static auto findRayqueryDialect = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add([](FuncSet &funcSet, auto &inst) { funcSet.insert(inst.getFunction()); }) .build(); findRayqueryDialect.visit(rayQueryFuncs, module); @@ -679,21 +689,29 @@ PreservedAnalyses LowerRayQuery::run(Module &module, ModuleAnalysisManager &anal payload.typeLower.finishPhis(); payload.typeLower.finishCleanup(); - static auto postVisit = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add(&LowerRayQuery::visitGetStaticFlagsOp) .add(&LowerRayQuery::visitStackReadOp) .add(&LowerRayQuery::visitStackWriteOp) .add(&LowerRayQuery::visitLdsStackInitOp) + .add(&LowerRayQuery::visitGetRayStaticIdOp) .build(); postVisit.visit(*this, module); m_typeLowering = nullptr; + + for (Instruction *call : m_callsToLower) { + call->dropAllReferences(); + call->eraseFromParent(); + } + m_callsToLower.clear(); + for (Function *func : m_funcsToLower) { func->dropAllReferences(); func->eraseFromParent(); } + m_funcsToLower.clear(); + return PreservedAnalyses::none(); } diff --git a/llvmraytracing/lib/LowerRaytracingPipeline.cpp b/llvmraytracing/lib/LowerRaytracingPipeline.cpp index f5d0cded8c..ac041a24ca 100644 --- a/llvmraytracing/lib/LowerRaytracingPipeline.cpp +++ b/llvmraytracing/lib/LowerRaytracingPipeline.cpp @@ -228,14 +228,18 @@ class ModuleMetadataState final { uint32_t getMaxPayloadRegisterCount() const { return MaxPayloadRegisterCount; } - std::optional tryGetPreservedPayloadRegisterCount() const { return PreservedPayloadRegisterCount; } - void updateMaxUsedPayloadRegisterCount(uint32_t Count) { MaxUsedPayloadRegisterCount = std::max(Count, MaxUsedPayloadRegisterCount); } uint32_t getMaxUsedPayloadRegisterCount() const { return MaxUsedPayloadRegisterCount; } + // Returns whether a value for 
maxUsedPayloadRegisterCount was set in the input module. + // If that is the case, for driver functions we rely on it. + // This mechanism ensures we don't rely on it in case the value was only initialized + // during processing of the current module. + bool maxUsedPayloadRegisterCountWasSet() const { return MaxUsedPayloadRegisterCountWasSet; } + uint32_t getMaxHitAttributeByteCount() const { return MaxHitAttributeByteCount; } bool isInLgcCpsMode() const { return IsInLgcCpsMode; } @@ -250,12 +254,13 @@ class ModuleMetadataState final { /// [In]: Maximum allowed number of registers to be used for the payload. /// It is guaranteed that all modules in a pipeline share this value. uint32_t MaxPayloadRegisterCount = 0; - /// [In]: If known, the number of payload registers that need to be preserved - /// by functions that don't know the payload type, e.g. Traversal. - std::optional PreservedPayloadRegisterCount = {}; - /// [Out]: The maximum number of payload registers written or read by any - /// shader in the module. This excludes intersection shaders, which - /// just pass through an existing payload. + /// [In/Out]: The maximum number of payload registers written or read by any + /// shader in the pipeline observed so far. + /// This excludes intersection shaders, which just pass through an existing payload. + /// If set on an incoming module, we can rely on it being an upper bound + /// for driver functions, because driver functions are compiled last and not + /// reused for child pipelines. + /// We can't rely on it when compiling app shaders (e.g. intersection). uint32_t MaxUsedPayloadRegisterCount = 0; /// [In]: The maximum size of hit attribute stored on the module as metadata. uint32_t MaxHitAttributeByteCount = 0; @@ -265,6 +270,8 @@ class ModuleMetadataState final { /// If the module has lgc.cps.module metadata attached. 
bool IsInLgcCpsMode = false; + + bool MaxUsedPayloadRegisterCountWasSet = false; }; class LowerRaytracingPipelinePassImpl final { @@ -611,14 +618,9 @@ ModuleMetadataState::ModuleMetadataState(Module &Module) : Mod{Module} { auto RegisterCountFromMD = ContHelper::MaxPayloadRegisterCount::tryGetValue(&Module); MaxPayloadRegisterCount = RegisterCountFromMD.value_or(DefaultPayloadRegisterCount); - // Check that if there is a required minimum number of payload registers, - // it is compatible - PreservedPayloadRegisterCount = ContHelper::PreservedPayloadRegisterCount::tryGetValue(&Module); - assert(PreservedPayloadRegisterCount.value_or(MaxPayloadRegisterCount) <= MaxPayloadRegisterCount); - - MaxUsedPayloadRegisterCount = ContHelper::MaxUsedPayloadRegisterCount::tryGetValue(&Module).value_or(0); - if (PreservedPayloadRegisterCount.has_value()) - MaxUsedPayloadRegisterCount = std::max(MaxUsedPayloadRegisterCount, PreservedPayloadRegisterCount.value()); + auto OptMaxUsedPayloadRegisterCount = ContHelper::MaxUsedPayloadRegisterCount::tryGetValue(&Module); + MaxUsedPayloadRegisterCount = OptMaxUsedPayloadRegisterCount.value_or(0); + MaxUsedPayloadRegisterCountWasSet = OptMaxUsedPayloadRegisterCount.has_value(); // Use max hit attribute size from metadata, or use globally max allowed // value for the max if metadata is not set @@ -961,6 +963,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy RetAddr = PoisonValue::get(Builder.getInt64Ty()); } else { RetAddr = Call->getArgOperand(RetAddrArgIndex); + assert(RetAddr->getType()->isIntegerTy(32) || RetAddr->getType()->isIntegerTy(64)); ++RetAddrArgIndex; } @@ -1430,9 +1433,9 @@ void LowerRaytracingPipelinePassImpl::setGpurtEntryRegisterCountMetadata() { // Even if PreservedPayloadRegisterCount is set, there may be // additional shaders in the current module whose usage is recorded // in MaxUsedPayloadRegisterCount, to take the max with it. 
- uint32_t MaxRegisterCount = - std::max(MetadataState.tryGetPreservedPayloadRegisterCount().value_or(MetadataState.getMaxPayloadRegisterCount()), - MetadataState.getMaxUsedPayloadRegisterCount()); + uint32_t MaxRegisterCount = MetadataState.maxUsedPayloadRegisterCountWasSet() + ? MetadataState.getMaxUsedPayloadRegisterCount() + : MetadataState.getMaxPayloadRegisterCount(); struct VisitorState { ModuleMetadataState &Metadata; @@ -1759,10 +1762,9 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData AllArgTypes.push_back(SystemDataTy); NewRetTy = SystemDataTy; - // We should have set up preserved register count for Traversal, if not, - // fall back to max count. - Data.NumPassedThroughPayloadDwords = - MetadataState.tryGetPreservedPayloadRegisterCount().value_or(MetadataState.getMaxPayloadRegisterCount()); + Data.NumPassedThroughPayloadDwords = MetadataState.maxUsedPayloadRegisterCountWasSet() + ? MetadataState.getMaxUsedPayloadRegisterCount() + : MetadataState.getMaxPayloadRegisterCount(); break; } default: @@ -2385,7 +2387,6 @@ PreservedAnalyses LowerRaytracingPipelinePassImpl::run() { static const auto Visitor = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByInstruction) .addSet( [](VisitorState &State, Instruction &Op) { auto *CInst = cast(&Op); diff --git a/llvmraytracing/lib/PipelineState.cpp b/llvmraytracing/lib/PipelineState.cpp new file mode 100644 index 0000000000..ffbda3b41d --- /dev/null +++ b/llvmraytracing/lib/PipelineState.cpp @@ -0,0 +1,116 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file PipelineState.cpp + * @brief Implementation of helpers for llvmraytracing pipeline state. 
+ *********************************************************************************************************************** + */ + +#include "llvmraytracing/PipelineState.h" +#include "llvmraytracing/ContinuationsUtil.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" + +using namespace llvm; + +namespace { +// Constants used in the msgpack format +namespace MsgPackFormat { + +constexpr unsigned MajorVersion = 1; + +static constexpr char Version[] = "version"; +static constexpr char MaxUsedPayloadRegisterCount[] = "max_used_payload_register_count"; + +} // namespace MsgPackFormat +} // anonymous namespace + +namespace llvmraytracing { + +Expected PipelineState::decodeMsgpack(llvm::msgpack::DocNode &Root) { + auto &Node = Root.getMap(); + + auto GetUInt = [](msgpack::DocNode &Node, auto &Out) { + if (!Node.isEmpty()) + Out = Node.getUInt(); + }; + + uint64_t Version = 0; + GetUInt(Node[MsgPackFormat::Version], Version); + if (Version != MsgPackFormat::MajorVersion) + return make_error("bad/missing llvmraytracing pipelinestate version", inconvertibleErrorCode()); + + PipelineState State = {}; + GetUInt(Node[MsgPackFormat::MaxUsedPayloadRegisterCount], State.MaxUsedPayloadRegisterCount); + + return State; +} + +Expected PipelineState::decodeMsgpack(StringRef Data) { + msgpack::Document Doc; + + if (!Doc.readFromBlob(Data, false)) + return make_error("failed to parse msgpack", inconvertibleErrorCode()); + + auto &Root = Doc.getRoot().getMap(); + return decodeMsgpack(Root); +} + +void PipelineState::encodeMsgpack(llvm::msgpack::DocNode &Root) const { + auto &Node = Root.getMap(true); + Node[MsgPackFormat::Version] = MsgPackFormat::MajorVersion; + Node[MsgPackFormat::MaxUsedPayloadRegisterCount] = MaxUsedPayloadRegisterCount; +} + +std::string PipelineState::encodeMsgpack() const { + msgpack::Document Doc; + + auto &Root = Doc.getRoot().getMap(true); + encodeMsgpack(Root); + + std::string Out; + Doc.writeToBlob(Out); + return Out; +} + +llvm::Expected 
PipelineState::fromModuleMetadata(const llvm::Module &M) { + PipelineState State = {}; + auto OptMaxUsedPayloadRegCount = ContHelper::tryGetMaxUsedPayloadRegisterCount(M); + if (OptMaxUsedPayloadRegCount.has_value()) + State.MaxUsedPayloadRegisterCount = *OptMaxUsedPayloadRegCount; + return State; +} + +void PipelineState::exportModuleMetadata(llvm::Module &M) const { + if (MaxUsedPayloadRegisterCount) { + ContHelper::setMaxUsedPayloadRegisterCount(M, MaxUsedPayloadRegisterCount); + } +} + +void PipelineState::merge(const PipelineState &Other) { + MaxUsedPayloadRegisterCount = std::max(MaxUsedPayloadRegisterCount, Other.MaxUsedPayloadRegisterCount); +} + +} // namespace llvmraytracing diff --git a/llvmraytracing/test/dx/continuation-registercount.ll b/llvmraytracing/test/dx/continuation-registercount.ll index 5d0bdefd4e..1e7418dd21 100644 --- a/llvmraytracing/test/dx/continuation-registercount.ll +++ b/llvmraytracing/test/dx/continuation-registercount.ll @@ -39,10 +39,10 @@ declare i32 @_cont_GetContinuationStackAddr() #0 declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) #0 ; Function Attrs: alwaysinline -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 +declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, i64, %struct.AnyHitTraversalData) #0 ; Function Attrs: nounwind memory(read) declare !pointeetys !24 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %struct.HitData*) #1 @@ -102,7 +102,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i ; Function Attrs: alwaysinline define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !37 { %dis_data = load %struct.DispatchSystemData, 
%struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -111,7 +111,7 @@ define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !poi ; Function Attrs: alwaysinline define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !38 { %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) + %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.AnyHitTraversalData %trav_data) store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 call void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) ret i1 true @@ -155,12 +155,16 @@ define void @called(%struct.MyParams* %arg) !pointeetys !39 { } ; MAX10-DAG: Incoming payload VGPR size of "Intersection" (intersection): 10 dwords -; MAX10-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 10 dwords +; MAX30-DAG: Incoming payload VGPR size of "Intersection" (intersection): 30 dwords ; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX10-DAG: call void (...) @lgc.cps.jump(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}: 10 dwords +; MAX10-DAG: call void (...) @lgc.cps.jump(i64 3, {{.*}}: 10 dwords +; MAX30-DAG: call void (...) 
@lgc.cps.jump(i64 3, {{.*}}: 30 dwords + +; MAX10-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 10 dwords ; MAX30-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 30 dwords ; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX30-DAG: call void (...) @lgc.cps.jump(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}: 30 dwords +; MAX10-DAG: call void (...) @lgc.cps.jump(i64 %returnAddr.reload{{.*}}: 10 dwords +; MAX30-DAG: call void (...) @lgc.cps.jump(i64 %returnAddr.reload{{.*}}: 30 dwords define void @Intersection() #3 { %a = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 @@ -231,8 +235,8 @@ attributes #3 = { nounwind } !dx.entryPoints = !{!3, !6, !13, !15, !17, !19, !21, !57} !continuation.maxPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 !continuation.maxPayloadRegisterCount = !{!53} ; 30; only for MAX_REG_30 -!continuation.preservedPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 -!continuation.preservedPayloadRegisterCount = !{!54} ; 27; only for MAX_REG_30 +!continuation.maxUsedPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 +!continuation.maxUsedPayloadRegisterCount = !{!54} ; 27; only for MAX_REG_30 !lgc.rt.max.attribute.size = !{!60} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} diff --git a/llvmraytracing/test/dx/continuation-stacksize.ll b/llvmraytracing/test/dx/continuation-stacksize.ll index 590090f7f2..37861b2f0a 100644 --- a/llvmraytracing/test/dx/continuation-stacksize.ll +++ b/llvmraytracing/test/dx/continuation-stacksize.ll @@ -31,7 +31,7 @@ declare !pointeetys !33 i1 @_cont_ReportHit(%struct.TraversalData* %data, float declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, 
%struct.DispatchSystemData) #0 ; Function Attrs: alwaysinline declare !pointeetys !17 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 @@ -61,7 +61,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i ; Function Attrs: alwaysinline define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !23 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -147,6 +147,7 @@ attributes #3 = { nounwind memory(read) } !dx.valver = !{!1} !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6, !13, !15} +!lgc.rt.max.attribute.size = !{!34} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -182,3 +183,4 @@ attributes #3 = { nounwind memory(read) } !31 = !{i32 0, %struct.TheirParams2 poison} !32 = !{i32 0, %struct.TraversalData poison} !33 = !{%struct.TraversalData poison} +!34 = !{i32 8} diff --git a/llvmraytracing/test/dx/continuation-without-await.ll b/llvmraytracing/test/dx/continuation-without-await.ll index d0f0d6155b..b26552c00e 100644 --- a/llvmraytracing/test/dx/continuation-without-await.ll +++ b/llvmraytracing/test/dx/continuation-without-await.ll @@ -26,7 +26,7 @@ declare i32 @_cont_GetContinuationStackAddr() declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) declare !pointeetys !16 
%struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) @@ -53,7 +53,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !pointeetys !22 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -94,6 +94,7 @@ attributes #2 = { nounwind } !dx.valver = !{!1} !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6, !13, !14} +!lgc.rt.max.attribute.size = !{!29} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -124,13 +125,14 @@ attributes #2 = { nounwind } !26 = !{i32 0, %struct.TheirParams poison} !27 = !{i32 0, %struct.TraversalData poison} !28 = !{%struct.TraversalData poison} +!29 = !{i32 8} ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( ; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) { ; LOWERRAYTRACINGPIPELINE-NEXT: ret i32 5 ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @main( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META21:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[PARAMS:%.*]] = alloca 
[[STRUCT_THEIRPARAMS:%.*]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 @@ -142,15 +144,15 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP3]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [9 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount [[META21]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } @await(ptr [[TMP5]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP6]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [2 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META22:![0-9]+]], !continuation.returnedRegistercount [[META22]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } @await(ptr [[TMP5]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP6]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP11]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load 
i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP6]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP6]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP18]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] @@ -160,7 +162,7 @@ attributes #2 = { nounwind } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @main_no_call( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META20]] !continuation.registercount [[META8]] !continuation [[META23:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -170,7 +172,7 @@ attributes #2 = { nounwind } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @called( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META24:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 
[[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META24:![0-9]+]] !continuation.registercount [[META18:![0-9]+]] !continuation [[META25:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [3 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 @@ -201,7 +203,7 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [3 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [8 x i32] poison, [3 x i32] [[TMP28]]), !continuation.registercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [2 x i32] poison, [3 x i32] [[TMP28]]), !continuation.registercount [[META18]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -211,23 +213,23 @@ attributes #2 = { nounwind } ; ; ; CLEANUP-LABEL: define void @main( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META21:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 0, 0 ; CLEANUP-NEXT: [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP1]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount [[META21]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP1]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META22:![0-9]+]], !continuation.returnedRegistercount [[META22]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @main.resume.0( -; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META21]] !continuation [[META20]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [2 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META22]] !continuation [[META21]] { ; CLEANUP-NEXT: entryresume.0: -; CLEANUP-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 2 +; CLEANUP-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2 ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP3]], 0 -; CLEANUP-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 0 +; CLEANUP-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT4:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-NEXT: ret void @@ -236,7 +238,7 @@ attributes #2 = { nounwind } ; ; ; CLEANUP-LABEL: define void @main_no_call( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry 
[[META20]] !continuation.registercount [[META8]] !continuation [[META23:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -246,7 +248,7 @@ attributes #2 = { nounwind } ; ; ; CLEANUP-LABEL: define void @called( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META24:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META24:![0-9]+]] !continuation.registercount [[META18:![0-9]+]] !continuation [[META25:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i32] [[PAYLOAD]], 0 ; CLEANUP-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i32] [[PAYLOAD]], 1 @@ -257,7 +259,7 @@ attributes #2 = { nounwind } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; CLEANUP-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [8 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [2 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]), !continuation.registercount [[META18]] ; CLEANUP-NEXT: unreachable ; ; @@ -267,7 +269,7 @@ attributes #2 = { nounwind } ; ; ; POSTPROCESS-LABEL: define void @main( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation [[META21:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -277,18 +279,18 @@ attributes #2 = { nounwind } ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 0, 0 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) ; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP1]], i64 [[TMP2]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 2, i32 [[TMP1]], i64 [[TMP2]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @main.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation [[META20]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [2 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation [[META21]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP3]], 0 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT4:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: ret void @@ -297,7 +299,7 @@ attributes #2 = { nounwind } ; ; ; POSTPROCESS-LABEL: define void @main_no_call( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation [[META21:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage 
[[META8]] !continuation.entry [[META20]] !continuation [[META22:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -309,7 +311,7 @@ attributes #2 = { nounwind } ; ; ; POSTPROCESS-LABEL: define void @called( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation [[META23:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation [[META24:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -323,6 +325,6 @@ attributes #2 = { nounwind } ; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 ; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP1]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [8 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP1]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [2 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll b/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll new file mode 100644 index 0000000000..9e0aba31c2 --- /dev/null +++ b/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll @@ -0,0 +1,257 @@ +; Tests that if _cont_ExitRayGen ends with an enqueue, then we still free RayGen continuation state. +; This is a regression test, in an earlier version we only freed for returns and missed this case. +; RUN: grep -v "lgc.cps.module" %s | opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck %s +; RUN: opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck %s + +; There is just a single RayGen shader in this module, so any free must come from it. 
+; CHECK: call void @lgc.cps.free + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +%dx.types.Handle = type { i8* } +%struct.DispatchSystemData = type { <3 x i32> } +%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i64 } +%struct.SystemData = type { %struct.DispatchSystemData } +%struct.HitData = type { <3 x float>, <3 x float>, float, i32 } +%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } +%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } +%struct.RayPayload = type { <4 x float> } +%dx.types.ResourceProperties = type { i32, i32 } +%struct.BuiltInTriangleIntersectionAttributes2 = type { <2 x float> } +%struct.RaytracingAccelerationStructure = type { i32 } +%"class.RWTexture2D >" = type { <4 x float> } + +@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 +@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 + +define i32 @_cont_GetContinuationStackAddr() #0 { + ret i32 0 +} + +declare void @_AmdEnqueue(i64, i64, %struct.SystemData) + +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { + call void @_AmdEnqueue(i64 1, i64 1, %struct.SystemData poison) + unreachable +} + +declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 + +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 + +declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 + +declare !pointeetys !32 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData* %data) #0 + +declare !pointeetys !34 
%struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 + +declare !pointeetys !36 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 + +declare !pointeetys !37 void @_cont_SetTriangleHitAttributes(%struct.SystemData* %data, %struct.BuiltInTriangleIntersectionAttributes %val) + +define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !38 { + ret i32 5 +} + +declare i1 @opaqueIsEnd() + +define i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 !pointeetys !40 { + %isEnd = call i1 @opaqueIsEnd() + ret i1 %isEnd +} + +declare !pointeetys !42 i32 @_cont_HitKind(%struct.SystemData*) #0 + +; Function Attrs: nounwind +declare i64 @_AmdGetResumePointAddr() #1 + +; Function Attrs: nounwind +declare !pointeetys !43 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 + +; Function Attrs: nounwind +declare !pointeetys !44 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 + +; Function Attrs: nounwind +declare !pointeetys !43 void @_cont_AcceptHitAndEndSearch(%struct.DispatchSystemData* nocapture readnone) #1 + +; Function Attrs: nounwind +declare !pointeetys !44 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #1 + +; Function Attrs: nounwind +declare !pointeetys !43 void @_cont_IgnoreHit(%struct.DispatchSystemData* nocapture readnone) #1 + +; Function Attrs: nounwind +declare !pointeetys !44 void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* nocapture readnone) #1 + +define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !45 { + %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 + %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 + %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData 
%sys_data, 0 + %addr = call i64 @_AmdGetResumePointAddr() #3 + %trav_data2 = insertvalue %struct.TraversalData %trav_data, i64 %addr, 5 + %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i64 4, %struct.TraversalData %trav_data2) + store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 + call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) + ret void +} + +declare !pointeetys !46 void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0; + +declare !pointeetys !47 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 + +declare !pointeetys !48 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData* %data) + +declare !pointeetys !49 <3 x float> @_cont_ObjectRayOrigin3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) + +declare !pointeetys !49 <3 x float> @_cont_ObjectRayDirection3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) + +declare !pointeetys !51 float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) + +declare i32 @opaque() +declare void @use(i32) + +; Function Attrs: nounwind +define void @MyRayGen() #2 { + %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 + %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 + %3 = alloca %struct.RayPayload, align 4 + %4 = bitcast %struct.RayPayload* %3 to i8* + call void @llvm.lifetime.start.p0i8(i64 16, i8* %4) #1 + %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 + store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !52 + %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) + %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, 
i32 0 }) + ; Ensure continuation state + %cont.state = call i32 @opaque() + call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) + call void @use(i32 %cont.state) + %8 = load <4 x float>, <4 x float>* %5, align 4, !tbaa !52 + %9 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) + %10 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) + %11 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) + %12 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %11, %dx.types.ResourceProperties { i32 4098, i32 1033 }) + %13 = extractelement <4 x float> %8, i64 0 + %14 = extractelement <4 x float> %8, i64 1 + %15 = extractelement <4 x float> %8, i64 2 + %16 = extractelement <4 x float> %8, i64 3 + call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %12, i32 %9, i32 %10, i32 undef, float %13, float %14, float %15, float %16, i8 15) + call void @llvm.lifetime.end.p0i8(i64 16, i8* %4) #1 + ret void +} + +; Function Attrs: nounwind +declare !pointeetys !59 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #1 + +; Function Attrs: nounwind +declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #1 + +; Function Attrs: nounwind memory(none) +declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #3 + +; Function Attrs: nounwind memory(none) +declare float @dx.op.objectRayDirection.f32(i32, i8) #3 + +; Function Attrs: nounwind memory(none) +declare float @dx.op.objectRayOrigin.f32(i32, i8) #3 + +; Function Attrs: nounwind memory(read) +declare float @dx.op.rayTCurrent.f32(i32) #4 + +declare void 
@dx.op.acceptHitAndEndSearch(i32) #0 + +declare void @dx.op.ignoreHit(i32) #0 + +; Function Attrs: nounwind +declare !pointeetys !60 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #1 + +; Function Attrs: nounwind +declare !pointeetys !61 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes2(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes2*) #1 + +; Function Attrs: nounwind memory(none) +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3 + +; Function Attrs: nounwind memory(read) +declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare !pointeetys !63 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare !pointeetys !63 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 + +attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nounwind memory(read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!llvm.ident = !{!0} +!dx.version = !{!1} +!dx.valver = !{!1} +!dx.shaderModel = !{!2} +!dx.resources = !{!3} +!dx.typeAnnotations = !{!10} 
+!dx.entryPoints = !{!18, !29 } +!lgc.cps.module = !{} + +!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} +!1 = !{i32 1, i32 6} +!2 = !{!"lib", i32 6, i32 6} +!3 = !{!4, !7, null, null} +!4 = !{!5} +!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} +!6 = !{i32 0, i32 4} +!7 = !{!8} +!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} +!9 = !{i32 0, i32 9} +!10 = !{i32 1, void ()* @MyRayGen, !11} +!11 = !{!12} +!12 = !{i32 1, !13, !13} +!13 = !{} +!14 = !{!12, !15, !16} +!15 = !{i32 2, !13, !13} +!16 = !{i32 0, !13, !13} +!17 = !{!12, !15} +!18 = !{null, !"", null, !3, !19} +!19 = !{i32 0, i64 65536} +!21 = !{i32 8, i32 9, i32 6, i32 16, i32 7, i32 8, i32 5, !22} +!22 = !{i32 0} +!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !22} +!26 = !{i32 8, i32 8, i32 5, !22} +!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22} +!29 = !{void ()* @MyRayGen, !"MyRayGen", null, null, !30} +!30 = !{i32 8, i32 7, i32 5, !22} +!32 = !{%struct.AnyHitTraversalData poison} +!33 = !{i32 0, %struct.AnyHitTraversalData poison} +!34 = !{%struct.SystemData poison} +!35 = !{i32 0, %struct.SystemData poison} +!36 = !{%struct.SystemData poison} +!37 = !{%struct.SystemData poison} +!38 = !{%struct.DispatchSystemData poison} +!39 = !{i32 0, %struct.DispatchSystemData poison} +!40 = !{%struct.TraversalData poison} +!41 = !{i32 0, %struct.TraversalData poison} +!42 = !{%struct.SystemData poison} +!43 = !{%struct.DispatchSystemData poison} +!44 = !{%struct.AnyHitTraversalData poison} +!45 = !{%struct.DispatchSystemData poison} +!46 = !{%struct.DispatchSystemData poison} +!47 = !{%struct.AnyHitTraversalData poison} +!48 = 
!{%struct.DispatchSystemData poison} +!49 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} +!50 = !{i32 0, %struct.HitData poison} +!51 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} +!52 = !{!53, !53, i64 0} +!53 = !{!"omnipotent char", !54, i64 0} +!54 = !{!"Simple C/C++ TBAA"} +!55 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} +!56 = !{i32 0, %struct.RayPayload poison} +!57 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} +!58 = !{%struct.RayPayload poison} +!59 = !{%struct.RayPayload poison} +!60 = !{%struct.BuiltInTriangleIntersectionAttributes poison} +!61 = !{%struct.BuiltInTriangleIntersectionAttributes2 poison} +!62 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes2 poison} +!63 = !{i8 poison} +!64 = !{i32 0, i8 poison} diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll index 03fef54610..e3de9f1614 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll @@ -48,7 +48,7 @@ entry: !dx.entryPoints = !{!0, !3} !continuation.maxPayloadRegisterCount = !{!7} -!continuation.preservedPayloadRegisterCount = !{!8} +!continuation.maxUsedPayloadRegisterCount = !{!8} !0 = !{null, !"", null, !1, !6} !1 = !{!2, null, null, null} diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll index 6c498ff781..0c341d0fe3 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll @@ -26,7 +26,7 @@ define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeet ; Need _cont_ReportHit to get system data type declare !pointeetys !22 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) 
-declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) declare !pointeetys !15 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) @@ -38,7 +38,7 @@ define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwin define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !pointeetys !18 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -102,7 +102,7 @@ attributes #0 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP3]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [20 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [19 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } @await(ptr [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -136,7 +136,7 @@ attributes #0 = { nounwind } ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP3]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa19i32a1i32s(i32 2, i32 4, i32 5, [20 x i32] poison, [1 x i32] [[TMP4]]), !continuation.returnedRegistercount [[META14]], !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa19i32a1i32s(i32 2, i32 4, i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [19 x i32] poison, [1 x i32] [[TMP4]]), !continuation.returnedRegistercount [[META14]], !continuation.registercount [[META14]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP5]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[TMP6]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4 @@ -168,7 +168,7 @@ attributes #0 = { nounwind } ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 undef, 0 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 2, i32 [[TMP3]], i64 [[TMP4]], i32 5, [20 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP3]], i64 [[TMP4]], i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [19 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll index 57527032aa..2c4aa40ae2 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll @@ -28,7 +28,7 @@ declare !pointeetys !27 void @_cont_SetTriangleHitAttributes(%struct.SystemData* declare %struct.DispatchSystemData @_cont_Traversal(%struct.TraversalData) #0 -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 +declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, i64, %struct.AnyHitTraversalData) #0 declare !pointeetys !28 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 @@ -79,7 +79,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !36 { %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) + %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.AnyHitTraversalData %trav_data) store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 ret i1 true } @@ -181,7 +181,7 @@ declare !pointeetys !46 [4 x <3 x float>] @_cont_WorldToObject4x3(%struct.Dispat ; Function Attrs: nounwind define void @RayGen() #3 { ; LOWERRAYTRACINGPIPELINE-LABEL: define void @RayGen( -; 
LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META29:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META18]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META30:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META18]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -190,7 +190,7 @@ define void @RayGen() #3 { ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @RayGen( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META28:![0-9]+]] !continuation.entry [[META13:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META29:![0-9]+]] !continuation.entry [[META13:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -206,7 +206,7 @@ define void @RayGen() #3 { ; Function Attrs: nounwind define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @Intersection( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] 
[[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META30:![0-9]+]] !continuation [[META31:![0-9]+]] !continuation.registercount [[META25:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META31:![0-9]+]] !continuation [[META32:![0-9]+]] !continuation.registercount [[META26:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 @@ -234,11 +234,11 @@ define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float 4.000000e+00, i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]], [19 x i32] poison, [30 x i32] [[TMP13]]), !continuation.registercount [[META25]], !continuation.returnedRegistercount [[META25]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } @await(ptr [[TMP20]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP22]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]], {} poison, [30 x 
i32] [[TMP13]]), !continuation.registercount [[META26]], !continuation.returnedRegistercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await(ptr [[TMP20]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP22]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP22]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP22]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP14]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() @@ -246,16 +246,16 @@ define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE: 19: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], [8 x i32] poison, [30 x i32] [[TMP21]]), !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], [2 x i32] poison, [30 x i32] [[TMP21]]), !continuation.registercount [[META26]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 22: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], [2 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META26]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @Intersection( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META29:![0-9]+]] !continuation [[META30:![0-9]+]] !continuation.stacksize [[META31:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META30:![0-9]+]] !continuation [[META31:![0-9]+]] !continuation.stacksize [[META25:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 @@ -366,7 +366,7 @@ define void @Intersection() #3 { ; DXILCONTPOSTPROCESS-NEXT: 
[[DOTFCA_29_INSERT96:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT93]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(ptr @Intersection.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP11]], i64 [[TMP12]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_1_INSERT]], float 4.000000e+00, i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [19 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT96]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP11]], i64 [[TMP12]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_1_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], {} poison, [30 x i32] [[DOTFCA_29_INSERT96]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) @@ -381,39 +381,38 @@ define void @Intersection() #3 { ; Function Attrs: nounwind define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !pointeetys !47 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @AnyHit( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] !continuation.registercount [[META26:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META33:![0-9]+]] !continuation [[META34:![0-9]+]] 
!continuation.registercount [[META27:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP21]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store 
[[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP28]], ptr [[ORIGHITATTRS]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 @@ -450,22 +449,22 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] [[TMP55]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP42]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr 
inbounds i32, ptr [[TMP15]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP59]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP61]], ptr [[TMP62]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP67]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP60]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP66]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP70]], align 4 @@ -474,27 +473,21 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP57]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP56]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP58]], [8 x i32] poison, [10 x i32] [[TMP73]]), !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load [4 x i32], ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP58]], [2 x i32] poison, [4 x i32] [[TMP72]]), !continuation.registercount [[META27]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @AnyHit( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: 
[[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 0, 0, 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_0_0_EXTRACT]], ptr 
[[DOTFCA_0_0_0_0_GEP]], align 4 @@ -625,18 +618,12 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_GEP21:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP21]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP23]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP24]], 7 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[DOTFCA_2_EXTRACT]], 8 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[DOTFCA_3_EXTRACT]], 9 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP23]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] 
[[DOTFCA_0_INSERT]], i32 [[TMP24]], 1 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[DOTFCA_2_EXTRACT]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[DOTFCA_3_EXTRACT]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[TMP30:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP30]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP30]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) @@ -654,33 +641,33 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; Function Attrs: nounwind define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !pointeetys !47 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @ClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] !continuation.registercount [[META26]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [13 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] !continuation.registercount [[META27]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4 @@ -705,43 +692,37 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I1:%.*]] = load i32, ptr [[RESPTR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP36]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP38]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP35]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP49]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP38]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP47]], ptr [[TMP48]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP53]], ptr [[TMP49]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP52]], [20 x i32] poison, [10 x i32] [[TMP45]]), !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP52]], [14 x i32] poison, [4 x i32] [[TMP45]]), !continuation.registercount [[META27]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [13 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; DXILCONTPOSTPROCESS-NEXT: 
[[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_EXTRACT]], ptr [[DOTFCA_0_0_GEP]], align 4 @@ -780,18 +761,12 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP14]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[PAYLOAD_FCA_7_EXTRACT]], 7 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[PAYLOAD_FCA_8_EXTRACT]], 8 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[PAYLOAD_FCA_9_EXTRACT]], 9 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [4 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 
x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[TMP19:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP19]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [20 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP19]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [14 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) @@ -862,6 +837,7 @@ attributes #4 = { nounwind } !dx.resources = !{!3} !dx.typeAnnotations = !{!10} !dx.entryPoints = !{!14, !16, !19, !21, !23} +!lgc.rt.max.attribute.size = !{!51} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -914,3 +890,4 @@ attributes #4 = { nounwind } !48 = !{i32 0, %struct.RayPayload poison} !49 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} !50 = !{%struct.BuiltInTriangleIntersectionAttributes poison} +!51 = !{i32 8} diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll index 8d69eab346..1d5f3b5b9e 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll @@ -193,7 +193,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 
x i32] [[TMP10]]), !continuation.registercount [[META17]], !waitmask [[META20:![0-9]+]], !continuation.returnedRegistercount [[META17]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } @await(ptr [[TMP19]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } @await.2(ptr [[TMP19]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP25]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP13]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SMALLPAYLOAD]] poison, ptr [[P1]], align 4 @@ -204,7 +204,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP11]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT10:%.*]] -; LOWERRAYTRACINGPIPELINE: .split12: +; LOWERRAYTRACINGPIPELINE: .split: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP16]], align 4 @@ -249,7 +249,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT9:%.*]] -; LOWERRAYTRACINGPIPELINE: .split11: +; LOWERRAYTRACINGPIPELINE: .split.split: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = 
getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I5:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP53]], align 4 @@ -281,7 +281,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP52]], ptr addrspace(32) [[TMP50]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[TMP62]]), !continuation.registercount [[META13]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } @await.2(ptr [[TMP63]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } @await(ptr [[TMP63]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP64]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [2 x i32] [[TMP65]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_LARGEPAYLOAD]] poison, ptr [[P3]], align 4 @@ -310,7 +310,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP75]], ptr [[TMP53]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] -; LOWERRAYTRACINGPIPELINE: .split: +; LOWERRAYTRACINGPIPELINE: .split.split.split: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP70]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP84]], ptr 
[[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll index 95170965dc..ed75c1e686 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll @@ -28,7 +28,7 @@ declare i32 @_cont_GetContinuationStackAddr() declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) declare !pointeetys !13 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) @@ -49,7 +49,7 @@ declare !pointeetys !22 <3 x i32> @_cont_DispatchRaysDimensions3(%struct.Dispatc define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #1 !pointeetys !18 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -79,6 +79,7 @@ attributes #1 = { alwaysinline } !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6} !lgc.cps.module = !{} +!lgc.rt.max.attribute.size = !{!25} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -105,6 +106,7 @@ attributes #1 = { alwaysinline } !22 = !{%struct.DispatchSystemData poison} !23 = !{i32 0, %struct.AnyHitTraversalData poison} !24 = !{%struct.AnyHitTraversalData poison} +!25 = !{i32 8} ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( ; 
LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) { @@ -112,7 +114,7 @@ attributes #1 = { alwaysinline } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @called( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META14:![0-9]+]] !continuation [[META17:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !continuation.registercount [[META15:![0-9]+]] !continuation [[META18:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 @@ -127,15 +129,15 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [17 x i32] poison, [1 x i32] [[TMP7]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } @await(ptr [[TMP11]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP12]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, 
[[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP7]]), !continuation.registercount [[META15]], !continuation.returnedRegistercount [[META15]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } @await(ptr [[TMP11]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP12]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP13]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_MYPARAMS]] poison, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP12]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP12]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP8]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] @@ -149,7 +151,7 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], [16 x i32] poison, [1 x i32] [[TMP20]]), !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], [10 x i32] poison, [1 x i32] [[TMP20]]), !continuation.registercount [[META15]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -159,7 +161,7 @@ attributes #1 = { alwaysinline } ; ; ; CLEANUP-LABEL: define void @called( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META14:![0-9]+]] !continuation [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] !continuation.state [[META18]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !continuation.registercount [[META15:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] !continuation.state [[META13]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -170,17 +172,17 @@ attributes #1 = { alwaysinline } ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT9]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0) -; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP2]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [17 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] +; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP2]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META15]], !continuation.returnedRegistercount [[META15]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @called.resume.0( -; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [16 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META14]] !continuation [[META17]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META17]] !continuation.registercount [[META15]] !continuation [[META18]] { ; CLEANUP-NEXT: entryresume.0: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 2 +; CLEANUP-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 2 ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP4]], 0 -; CLEANUP-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 0 +; CLEANUP-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) 
[[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -192,7 +194,7 @@ attributes #1 = { alwaysinline } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]), !continuation.registercount [[META14]] +; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]), !continuation.registercount [[META15]] ; CLEANUP-NEXT: unreachable ; ; @@ -202,7 +204,7 @@ attributes #1 = { alwaysinline } ; ; ; POSTPROCESS-LABEL: define void @called( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -219,23 +221,23 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-NEXT: 
[[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [17 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP6]], i64 [[TMP7]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [16 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation [[META17]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META17]] !continuation [[META18]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4 ; POSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP11]], -8 -; POSTPROCESS-NEXT: [[TMP12:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP12:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 2 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP12]], 0 -; POSTPROCESS-NEXT: 
[[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) @@ -251,7 +253,7 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP9]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP10]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP10]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]) ; POSTPROCESS-NEXT: unreachable ; ; @@ -261,7 +263,7 @@ attributes #1 = { alwaysinline } ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @called( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] !continuation [[META19:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 @@ -276,14 +278,14 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP5]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa16i32a1i32s(i32 2, i32 4, i32 5, [17 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP7]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa10i32a1i32s(i32 2, i32 4, i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META15:![0-9]+]], !continuation.returnedRegistercount [[META15]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP7]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[TMP8]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_MYPARAMS]] poison, ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP7]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP7]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] @@ -297,7 +299,7 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP16]], [16 x i32] poison, [1 x i32] [[TMP17]]), !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP16]], [10 x i32] poison, [1 x i32] [[TMP17]]), !continuation.registercount [[META15]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -307,7 +309,7 @@ attributes #1 = { alwaysinline } ; ; ; CLEANUP-CPS-LABEL: define void @called( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] !continuation [[META19:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURN_ADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -318,33 +320,33 @@ attributes #1 = { alwaysinline } ; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[SYSTEM_DATA_FCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @called.resume.0) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 2, i32 4, {} poison, i64 [[TMP0]], i32 5, [17 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 2, i32 4, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META15:![0-9]+]], !continuation.returnedRegistercount [[META15]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @called.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [16 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META17]] !continuation [[META18]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META17]] !lgc.cps [[META18]] !continuation [[META19]] { ; CLEANUP-CPS-NEXT: entryresume.0: -; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, align 8 -; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, align 8 +; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 2 ; CLEANUP-CPS-NEXT: 
[[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP6]], 0 -; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-CPS-NEXT: [[RETURN_ADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[TMP5]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[RETURN_ADDR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RETURN_ADDR_RELOAD_ADDR]], align 4 -; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP8]]) ; CLEANUP-CPS-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP9]], i8 0 -; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP10]]) ; CLEANUP-CPS-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP11]], i8 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT10]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META15]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -354,7 +356,7 @@ attributes #1 = { alwaysinline } ; ; ; POSTPROCESS-CPS-LABEL: define void @called( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META16:![0-9]+]] !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META17:![0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !lgc.cps [[META19:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -371,31 +373,31 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 2, i32 [[TMP4]], i64 [[TMP5]], i32 5, [17 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP4]], i64 [[TMP5]], i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [16 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] !lgc.cps [[META18]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META17]] !lgc.rt.shaderstage [[META18]] !lgc.cps [[META19]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: -; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, align 8 +; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 +; POSTPROCESS-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], -8 -; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], 2 +; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 2 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = 
extractvalue [1 x i32] [[TMP7]], 0 -; POSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP8]], 0 ; POSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(21) ; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0 ; POSTPROCESS-CPS-NEXT: [[RETURN_ADDR_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP10]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP11]]) ; POSTPROCESS-CPS-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP12]], i8 0 -; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP13]]) ; POSTPROCESS-CPS-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP14]], i8 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT10]], 0 @@ -405,6 +407,6 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-CPS-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = zext i32 [[RETURN_ADDR_RELOAD]] to 
i64 ; POSTPROCESS-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP17]], i32 [[TMP18]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP17]], i32 [[TMP18]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline.ll b/llvmraytracing/test/dx/lower-rt-pipeline.ll index e595b929d8..54e4b0cdc6 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline.ll @@ -35,7 +35,7 @@ declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalDat declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 +declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, i64, %struct.AnyHitTraversalData) #0 define %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData* %data) #0 !pointeetys !32 { %resPtr = getelementptr %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, i32 0, i32 0 @@ -115,7 +115,7 @@ define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hi callAHit: ; preds = %0 %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) + %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.AnyHitTraversalData %trav_data) store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 call void 
@_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) ret i1 true @@ -351,6 +351,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !dx.typeAnnotations = !{!10} !dx.entryPoints = !{!18, !20, !23, !25, !27, !29, !31} !lgc.cps.module = !{} +!lgc.rt.max.attribute.size = !{!65} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -417,6 +418,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !62 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes2 poison} !63 = !{i8 poison} !64 = !{i32 0, i8 poison} +!65 = !{i32 8} ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetContinuationStackAddr( ; LOWERRAYTRACINGPIPELINE-SAME: ) #[[ATTR0:[0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: ret i32 0 @@ -497,9 +499,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META22]] !continuation [[META35:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META22]] !continuation [[META36:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 @@ -508,7 +510,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA36:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -518,53 +520,53 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr [[TMP37]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP38]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [16 x i32] poison, [10 x i32] [[TMP39]]), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount [[META33]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } @await(ptr [[TMP40]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } 
[[TMP41]], 2 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[TMP42]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP43]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load [4 x i32], ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [4 x i32] [[TMP31]]), !continuation.registercount [[META34:![0-9]+]], !continuation.returnedRegistercount [[META34]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } @await(ptr [[TMP39]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP41]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[TMP42]], ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, 
ptr [[TMP37]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP44]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP45]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP41]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP47]], ptr [[TMP43]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP41]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP19]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa 
[[TBAA36]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP30]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP46]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP28]], i64 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP28]], i64 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP28]], i64 2 @@ -576,30 +578,30 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META40:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 
[[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META41:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP39]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr 
inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP42]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP13]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP2]], align 4 @@ -622,29 +624,29 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 ; 
LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP30]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP32]], ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP29]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP34]], ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP36]], ptr 
[[TMP40]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP36]], ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP48]], ptr [[TMP40]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [33 x i32] poison, [10 x i32] [[TMP45]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [4 x i32], ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [27 x i32] poison, [4 x i32] [[TMP45]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META42:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META43:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -655,32 +657,31 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load 
i32, ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr [[TMP27]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP22]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP42]], ptr [[ORIGHITATTRS]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 @@ -727,174 +728,172 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP36:%.*]] = fcmp fast ogt float [[TMP34]], 1.000000e+00 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP34]], -1.000000e+00 ; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP35]], label [[TMP38:%.*]], label [[TMP73:%.*]] -; LOWERRAYTRACINGPIPELINE: 42: +; LOWERRAYTRACINGPIPELINE: 41: ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP56:%.*]] -; LOWERRAYTRACINGPIPELINE: 43: +; LOWERRAYTRACINGPIPELINE: 42: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP40]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP46]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP49]], ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP48]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP52]], ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr 
[[TMP48]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr [[TMP62]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr [[TMP47]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP66]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP68]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP81]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP41]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP48]], ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr [[TMP49]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = load i32, ptr [[TMP60]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP63]], ptr [[TMP52]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr 
[[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP81]], ptr [[TMP80]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP53]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], [8 x i32] poison, [10 x i32] [[TMP63]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], [2 x i32] poison, [4 x i32] [[TMP62]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE: 64: +; LOWERRAYTRACINGPIPELINE: 63: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP57]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP58]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP69]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP82]], ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP86]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP87]], ptr [[TMP85]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP88]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP97]], ptr [[TMP64]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load i32, ptr 
[[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP105]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP78]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP80]], ptr [[TMP111]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP58]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP66]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP68]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP67]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP84]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP87]], ptr [[TMP82]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP74]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP75]], ptr [[TMP88]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP103]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP104]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP79]], ptr [[TMP78]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP9]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP70]], ptr [[TMP71]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]], [8 x i32] poison, [10 x i32] [[TMP84]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]], [2 x i32] poison, [4 x i32] [[TMP83]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable +; LOWERRAYTRACINGPIPELINE: 84: +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP37]], label [[TMP85:%.*]], label [[TMP128:%.*]] ; LOWERRAYTRACINGPIPELINE: 85: -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP37]], label [[TMP74:%.*]], label [[TMP109:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP36]], label [[TMP86:%.*]], label [[TMP109:%.*]] ; LOWERRAYTRACINGPIPELINE: 86: -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP36]], label [[TMP75:%.*]], label [[TMP92:%.*]] -; LOWERRAYTRACINGPIPELINE: 87: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_IgnoreHit(ptr [[TMP76]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP77]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP119]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP127]], ptr [[TMP79]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP128:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[TMP119]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = load i32, ptr [[TMP95]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP129]], ptr [[TMP128]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = getelementptr inbounds i32, ptr [[TMP119]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP98]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP99]], ptr [[TMP83]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP101]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP102]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP104]], ptr [[TMP103]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP105]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP126]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP92]], ptr [[TMP125]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP144]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP95]], ptr [[TMP129]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP96:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP97]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP98]], ptr [[TMP96]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP99]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = load i32, ptr [[TMP100]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP102]], ptr [[TMP101]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP89:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP89]], ptr [[TMP90]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]], [8 x i32] poison, [10 x i32] [[TMP132]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]], [2 x i32] poison, [4 x i32] [[TMP145]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE: 109: +; LOWERRAYTRACINGPIPELINE: 107: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP93:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_IgnoreHit(ptr [[TMP93]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = load i32, ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP134]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP114]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP115]], ptr [[TMP96]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[TMP96]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP114]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = load i32, ptr [[TMP117]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP118]], ptr [[TMP140]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP96]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[TMP114]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP121]], ptr [[TMP100]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP147:%.*]] = getelementptr 
inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP148:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP148]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP149:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP149]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP126]], ptr [[TMP125]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP146:%.*]] = load i32, ptr [[TMP94]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP146]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP112]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[TMP111]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP116]], ptr [[TMP114]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = load i32, ptr [[TMP118]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP119]], ptr [[TMP117]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; 
LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP120]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP148:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP149:%.*]] = load i32, ptr [[TMP121]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP149]], ptr [[TMP148]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP106:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP106]], ptr [[TMP107]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]], [8 x i32] poison, [10 x i32] [[TMP130]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP127:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]], [2 x i32] poison, [4 x i32] [[TMP127]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE: 131: +; LOWERRAYTRACINGPIPELINE: 128: ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP110]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP133]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP110]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP130]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP132]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP133]], ptr [[TMP131]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP136]], ptr [[TMP112]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[TMP112]], i32 1 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP135]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP136]], ptr [[TMP134]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP139:%.*]] = load i32, ptr [[TMP138]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP139]], ptr [[TMP137]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = getelementptr inbounds i32, ptr [[TMP112]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP141:%.*]] = getelementptr inbounds i32, ptr [[TMP135]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = load i32, ptr [[TMP141]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP142]], ptr [[TMP116]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP143]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP146:%.*]] = load i32, ptr [[TMP144]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP146]], ptr [[TMP145]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP140]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP141:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load i32, ptr [[TMP141]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP143]], ptr [[TMP142]], align 4 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP122]], ptr [[TMP123]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP150:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]], [8 x i32] poison, [10 x i32] [[TMP150]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP147:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]], [2 x i32] poison, [4 x i32] [[TMP147]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation.registercount [[META33:![0-9]+]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -919,11 +918,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], [32 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META32]], 
!continuation.returnedRegistercount [[META32]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } @await.1(ptr [[TMP23]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP24]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], {} poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await.1(ptr [[TMP23]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP26]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP24]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP10]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -947,18 +946,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 23: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [2 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 26: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyIntersectionShader2( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43]] !continuation.registercount [[META32]] !continuation [[META45:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META44]] !continuation.registercount [[META33]] !continuation [[META46:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2:%.*]], align 4 @@ -983,11 +982,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP8]], [32 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount 
[[META32]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } @await.2(ptr [[TMP23]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP24]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP8]], {} poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await.2(ptr [[TMP23]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP26]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP24]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP10]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -1011,61 +1010,61 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 23: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call 
void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [2 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 26: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyMissShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !continuation.registercount [[META34]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP23]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP26]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> , ptr [[TMP12]], align 4 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[TMP28]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP30]], 
align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP32]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], [33 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [4 x i32], ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], [27 x i32] poison, [4 x i32] [[TMP29]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -1149,9 +1148,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 @@ -1160,7 +1159,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 
0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA38:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -1170,46 +1169,46 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP11]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 ; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa33i32a10i32s(i32 4, i32 8, i32 5, [36 x i32] poison, [10 x i32] [[TMP21]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP22]], 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 3 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP27]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [4 x i32], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa27i32a4i32s(i32 4, i32 8, i32 5, [30 x i32] poison, [4 x i32] [[TMP21]]), !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP22]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [4 x i32] [[TMP23]], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP12]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP35]], ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP22]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 3 
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP47]], ptr [[TMP45]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP22]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP24]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: .split: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA38]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() @@ -1227,29 +1226,29 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40:![0-9]+]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] 
!lgc.rt.shaderstage [[META41:![0-9]+]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP4]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP7]], ptr [[TMP3]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], 
i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP13]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP13]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP34]], ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP14]], ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP1]], align 4 @@ -1273,28 +1272,28 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP28]], ptr [[TMP29]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr 
[[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP31]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP34]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP31]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP37]], ptr [[TMP35]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP37]], ptr [[TMP32]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP40]], ptr [[TMP38]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP40]], ptr [[TMP35]], align 4 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP45]], ptr [[TMP38]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP42]], [33 x i32] poison, [10 x i32] [[TMP43]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = load [4 x i32], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP42]], [27 x i32] poison, [4 x i32] [[TMP43]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META34]] !continuation [[META44:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -1305,31 +1304,30 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = alloca [4 x i32], align 4 ; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP22]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP9]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP25]], ptr [[ORIGHITATTRS]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 @@ -1376,174 
+1374,172 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], 1.000000e+00 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP35]], -1.000000e+00 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP82:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 39: +; LOWERRAYTRACINGPIPELINE-CPS: 38: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP37]], label [[TMP40:%.*]], label [[TMP61:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 40: +; LOWERRAYTRACINGPIPELINE-CPS: 39: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP41]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP43]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP45]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP46]], ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 
[[TMP49]], ptr [[TMP47]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP52]], ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP53]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP54]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP56]], ptr [[TMP55]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP43]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP45]], ptr [[TMP56]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP48]], ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 3 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP51]], ptr [[TMP49]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP52]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP53]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP55]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP57]], ptr [[TMP58]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP59:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP59]], [8 x i32] poison, [10 x i32] [[TMP60]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP59]], [2 x i32] poison, [4 x i32] [[TMP60]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: 61: +; LOWERRAYTRACINGPIPELINE-CPS: 60: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP62]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP64]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP66]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP67]], ptr [[TMP65]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP65]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP70]], ptr [[TMP68]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP65]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP72]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP73]], ptr 
[[TMP71]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP74]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP75]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP77]], ptr [[TMP76]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP77]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP66]], ptr [[TMP64]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP68]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP69]], ptr [[TMP67]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP71]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP72]], ptr [[TMP70]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 
[[TMP73]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP74]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP76]], ptr [[TMP75]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP78:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP78]], ptr [[TMP79]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP80:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP81:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP80]], [8 x i32] poison, [10 x i32] [[TMP81]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP81:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP80]], [2 x i32] poison, [4 x i32] [[TMP81]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable +; LOWERRAYTRACINGPIPELINE-CPS: 81: +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP38]], label [[TMP84:%.*]], label [[TMP141:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 82: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP38]], label [[TMP83:%.*]], label [[TMP128:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP37]], label [[TMP83:%.*]], label [[TMP105:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 83: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP37]], label [[TMP84:%.*]], label [[TMP106:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 84: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP85:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP85]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP86:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP86]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP87]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP89]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP90]], ptr [[TMP88]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[TMP88]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP92:%.*]] = getelementptr inbounds i32, ptr [[TMP89]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP92]], 
align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP93]], ptr [[TMP91]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[TMP88]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[TMP89]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP96]], ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP98:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP98]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP99:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP99]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP101]], ptr [[TMP100]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP100:%.*]] = load i32, ptr [[TMP86]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP100]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP88]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP89]], ptr [[TMP87]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TMP92:%.*]] = load i32, ptr [[TMP91]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP92]], ptr [[TMP90]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP93:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP94]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP95]], ptr [[TMP93]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP96:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP96]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP98:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP97]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP99]], ptr [[TMP98]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP102:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP102]], ptr [[TMP103]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP104:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP105:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP104]], [8 x i32] poison, [10 x i32] [[TMP105]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP106:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP104]], [2 x i32] poison, [4 x i32] [[TMP106]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: 106: +; LOWERRAYTRACINGPIPELINE-CPS: 104: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP107]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP108]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP109]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP112:%.*]] = load i32, ptr [[TMP111]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP112]], ptr [[TMP110]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP113:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP111]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP114]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP115]], ptr [[TMP113]], 
align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP116:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP111]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP118:%.*]] = load i32, ptr [[TMP117]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP118]], ptr [[TMP116]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP120:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP120]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP122:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP121]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP123]], ptr [[TMP122]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP108]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP121]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP122:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP109:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP109]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP110]], ptr [[TMP122]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP112]], align 4 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP113]], ptr [[TMP111]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP116]], ptr [[TMP114]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP117:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP117]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP118:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP119:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP118]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP120]], ptr [[TMP119]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP124:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP124]], ptr [[TMP125]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP126:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP127:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP126]], [8 x i32] poison, [10 x i32] [[TMP127]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP142:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP126]], [2 x i32] poison, [4 x i32] [[TMP142]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: 128: +; LOWERRAYTRACINGPIPELINE-CPS: 125: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP129]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP130]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP131:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP129]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP127]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP128:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP143:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP143]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP130]], ptr [[TMP128]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP131:%.*]] = 
getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP132]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP133]], ptr [[TMP131]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP134:%.*]] = getelementptr inbounds i32, ptr [[TMP131]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP132]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP134:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 3 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP136]], ptr [[TMP134]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[TMP131]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP132]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP139:%.*]] = load i32, ptr [[TMP138]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP139]], ptr [[TMP137]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP140:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP140]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP141:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP142:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP143:%.*]] = load i32, ptr [[TMP141]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP143]], ptr [[TMP142]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP137:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 
[[TMP137]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP139:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP140:%.*]] = load i32, ptr [[TMP138]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP140]], ptr [[TMP139]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP144:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP145:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP144]], ptr [[TMP145]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP146:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP147:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP146]], [8 x i32] poison, [10 x i32] [[TMP147]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP147:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP146]], [2 x i32] poison, [4 x i32] [[TMP147]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -1568,10 +1564,10 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [32 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa2i32a30i32s(i32 3, i32 16, i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], {} poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP9]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP11]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -1595,18 +1591,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS: 21: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr 
[[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 24: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [8 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [2 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader2( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2:%.*]], align 4 @@ -1631,10 +1627,10 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP7]], [32 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa2i32a30i32s(i32 3, i32 16, i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP7]], {} poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP9]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP11]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -1658,60 +1654,60 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS: 21: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], 
align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 24: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [8 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [2 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyMissShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META41]] !continuation [[META48:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !lgc.cps [[META42]] !continuation [[META48:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 4 -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP6]], ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP9]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 
[[SHADER_INDEX]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> , ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP24]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP29]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [33 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [4 x i32], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [27 x i32] poison, [4 x i32] [[TMP27]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -1795,7 +1791,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-LABEL: define void @MyRayGen( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META35:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META36:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1819,40 +1815,28 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 ; POSTPROCESS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP11]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x 
i32] [[DOTFCA_4_INSERT]], i32 undef, 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP7]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP8]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP9]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP11]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP7]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP8]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP9]], 3 ; POSTPROCESS-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 4, i32 [[TMP10]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [16 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 4, i32 [[TMP10]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyRayGen.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META35]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META36]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[TMP19:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP19]], align 4 -; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP1]], 2 -; POSTPROCESS-NEXT: [[TMP10:%.*]] = extractvalue [10 x i32] [[TMP16]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 6 -; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue [10 x i32] [[TMP16]], 7 -; POSTPROCESS-NEXT: 
[[TMP5:%.*]] = extractvalue [10 x i32] [[TMP16]], 8 -; POSTPROCESS-NEXT: [[TMP7:%.*]] = extractvalue [10 x i32] [[TMP16]], 9 +; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP10:%.*]] = extractvalue [4 x i32] [[TMP16]], 0 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue [4 x i32] [[TMP16]], 1 +; POSTPROCESS-NEXT: [[TMP5:%.*]] = extractvalue [4 x i32] [[TMP16]], 2 +; POSTPROCESS-NEXT: [[TMP7:%.*]] = extractvalue [4 x i32] [[TMP16]], 3 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP10]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; POSTPROCESS-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float @@ -1861,7 +1845,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP6]], i32 2 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP7]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP8]], i32 3 -; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP17]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: [[TMP18:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ -1896,21 +1880,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-LABEL: define void @MyClosestHitShader( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] 
[[TMP0:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue 
[[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; POSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_EXTRACT]], ptr [[DOTFCA_0_0_GEP]], align 4 @@ -1954,37 +1932,25 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP23]], i32 0, i32 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP19]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP19]], 0 +; POSTPROCESS-NEXT: 
[[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT1]], i32 [[TMP20]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP21]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP22]], 3 ; POSTPROCESS-NEXT: [[TMP28:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP28]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP28]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define void @MyAnyHitShader( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue 
[10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 0, 0, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 ; POSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_0_0_EXTRACT]], ptr [[DOTFCA_0_0_0_0_GEP]], align 4 @@ -2178,18 +2144,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP236:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP236]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT]], i32 [[DOTFCA_1_3_LOAD]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 
[[TMP22]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP23]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP24]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP25]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP22]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT1]], i32 [[TMP23]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP24]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP25]], 3 ; POSTPROCESS-NEXT: [[TMP38:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP38]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP38]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 32: ; POSTPROCESS-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -2254,18 +2214,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP111:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD112:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP111]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT113:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT110]], i32 [[DOTFCA_1_3_LOAD112]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT61:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP41]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT64:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT61]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT67:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT64]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT70:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT67]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT73:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT70]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT76:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT73]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT79:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT76]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT82:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT79]], i32 [[TMP35]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT85:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT82]], i32 [[TMP36]], 8 -; POSTPROCESS-NEXT: 
[[DOTFCA_9_INSERT88:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT85]], i32 [[TMP37]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT62:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP41]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT65:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT62]], i32 [[TMP35]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT68:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT65]], i32 [[TMP36]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT71:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT68]], i32 [[TMP37]], 3 ; POSTPROCESS-NEXT: [[TMP52:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP52]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT113]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT88]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP52]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT113]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT71]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 44: ; POSTPROCESS-NEXT: br i1 [[TMP18]], label [[TMP53:%.*]], label [[TMP71:%.*]] @@ -2330,18 +2284,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP152:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD153:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP152]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT154:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT151]], i32 [[DOTFCA_1_3_LOAD153]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT91:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP48]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT94:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT91]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT97:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT94]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; 
POSTPROCESS-NEXT: [[DOTFCA_3_INSERT100:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT97]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT103:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT100]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT106:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT103]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT109:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT106]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT112:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT109]], i32 [[TMP49]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT115:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT112]], i32 [[TMP50]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT118:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT115]], i32 [[TMP51]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT74:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP48]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT77:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT74]], i32 [[TMP49]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT80:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT77]], i32 [[TMP50]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT83:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT80]], i32 [[TMP51]], 3 ; POSTPROCESS-NEXT: [[TMP55:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP55]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT154]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT118]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP55]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT154]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT83]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 56: ; POSTPROCESS-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -2402,18 +2350,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP193:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD194:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP193]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT195:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT192]], i32 [[DOTFCA_1_3_LOAD194]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT121:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP64]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT124:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT121]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT127:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT124]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT130:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT127]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT133:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT130]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT136:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT133]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT139:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT136]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT142:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT139]], i32 [[TMP59]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT145:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT142]], i32 [[TMP60]], 8 -; 
POSTPROCESS-NEXT: [[DOTFCA_9_INSERT148:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT145]], i32 [[TMP61]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT86:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP64]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT89:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT86]], i32 [[TMP59]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT92:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT89]], i32 [[TMP60]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT95:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT92]], i32 [[TMP61]], 3 ; POSTPROCESS-NEXT: [[TMP65:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP65]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT195]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT148]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP65]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT195]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT95]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 66: ; POSTPROCESS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) @@ -2477,23 +2419,17 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP234:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD235:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP234]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT236:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT233]], i32 [[DOTFCA_1_3_LOAD235]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT151:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP72]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT154:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT151]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT157:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT154]], i32 
[[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT160:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT157]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT163:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT160]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT166:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT163]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT169:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT166]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT172:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT169]], i32 [[TMP73]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT175:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT172]], i32 [[TMP69]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT178:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT175]], i32 [[TMP70]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT98:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP72]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT101:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT98]], i32 [[TMP73]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT104:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT101]], i32 [[TMP69]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT107:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT104]], i32 [[TMP70]], 3 ; POSTPROCESS-NEXT: [[TMP80:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP80]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT236]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT178]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP80]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT236]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT107]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define void @MyIntersectionShader( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] !continuation.stacksize [[META42:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] !continuation.stacksize [[META32:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2608,7 +2544,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: isEnd.i: ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -2674,7 +2610,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT351]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT351]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 18: ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -2724,18 +2660,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META40]] !continuation [[META41]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META41]] !continuation [[META42]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP15]], -8 -; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP1]], 2 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 1 ; POSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 2 @@ -2766,7 +2702,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 27 ; POSTPROCESS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 28 ; POSTPROCESS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 29 -; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { 
[[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT16:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 0, 0, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT18:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 1, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_1_1_EXTRACT20:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 1, 1 @@ -2834,7 +2770,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 12: ; POSTPROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) @@ -2887,12 +2823,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define void @MyIntersectionShader2( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META42]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META32]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3007,7 +2943,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader2.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: isEnd.i: ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -3073,7 +3009,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT351]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT351]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 18: ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -3123,18 +3059,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META40]] !continuation [[META43]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META41]] !continuation [[META43]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP15]], -8 -; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP1]], 2 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 1 ; POSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 2 @@ -3165,7 +3101,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 27 ; POSTPROCESS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 28 ; POSTPROCESS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 29 -; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { 
[[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT16:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 0, 0, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT18:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 1, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_1_1_EXTRACT20:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 1, 1 @@ -3233,7 +3169,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 12: ; POSTPROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) @@ -3286,25 +3222,19 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define void @MyMissShader( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META44:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] 
[[PAYLOAD]], 1 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0 ; POSTPROCESS-NEXT: [[TMP1:%.*]] = bitcast i32 [[PAYLOAD_FCA_0_EXTRACT]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 @@ -3324,18 +3254,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> , i32 3 ; POSTPROCESS-NEXT: [[TMP12:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP5]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP6]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP7]], 8 -; POSTPROCESS-NEXT: 
[[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP12]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP5]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT1]], i32 [[TMP6]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP7]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP12]], 3 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP13]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP13]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; @@ -3419,7 +3343,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyRayGen( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -3441,36 +3365,24 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; 
CLEANUP-CPS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; CLEANUP-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 ; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP7]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP8]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 8, {} poison, i64 [[TMP6]], i32 5, [36 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP7]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP8]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP9]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP10]], 3 +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 8, {} poison, i64 [[TMP6]], i32 5, [30 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META35]] !continuation [[META36]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { ; CLEANUP-CPS-NEXT: entryresume.0: -; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, align 8 -; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4 -; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 1 -; CLEANUP-CPS-NEXT: 
[[DOTFCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 9 +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, align 8 +; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], ptr [[TMP4]], align 4 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 3 ; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = bitcast i32 [[DOTFCA_0_EXTRACT]] to float ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0 ; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = bitcast i32 [[DOTFCA_7_EXTRACT]] to float @@ -3479,11 +3391,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP8]], i32 2 ; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = bitcast i32 [[DOTFCA_9_EXTRACT]] to float ; CLEANUP-CPS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> 
[[DOTSROA_0_8_VEC_INSERT]], float [[TMP9]], i32 3 -; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP10]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[RES_1_I1:%.*]] = load i32, ptr [[TMP12]], align 4 ; CLEANUP-CPS-NEXT: [[RESPTR_2_I2:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 1 ; CLEANUP-CPS-NEXT: [[RES_2_I3:%.*]] = load i32, ptr [[RESPTR_2_I2]], align 4 @@ -3493,7 +3405,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[VAL_1_I7:%.*]] = insertelement <3 x i32> [[VAL_0_I6]], i32 [[RES_2_I3]], i32 1 ; CLEANUP-CPS-NEXT: [[VAL_2_I8:%.*]] = insertelement <3 x i32> [[VAL_1_I7]], i32 [[RES_3_I5]], i32 2 ; CLEANUP-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[VAL_2_I8]], i8 0 -; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[RES_1_I:%.*]] = load i32, ptr [[TMP13]], align 4 ; CLEANUP-CPS-NEXT: [[RESPTR_2_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr 
[[TMP13]], i32 0, i32 0, i32 1 ; CLEANUP-CPS-NEXT: [[RES_2_I:%.*]] = load i32, ptr [[RESPTR_2_I]], align 4 @@ -3514,19 +3426,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyClosestHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37:![0-9]+]] !lgc.cps [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; CLEANUP-CPS-NEXT: 
[[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; CLEANUP-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_GEP]], align 4 @@ -3569,34 +3475,22 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT10:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP17]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 
[[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP18]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP19]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP20]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP17]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP18]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP19]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP20]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyAnyHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META34]] !continuation [[META41:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] 
[[PAYLOAD]], 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 0, 0 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 ; CLEANUP-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_0_0_GEP]], align 4 @@ -3789,17 +3683,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT]], i32 [[DOTFCA_1_3_LOAD]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP19]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: 
[[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP19]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP20]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP21]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP22]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 28: ; CLEANUP-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -3864,17 +3752,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP262:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD263:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP262]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT264:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT261]], i32 [[DOTFCA_1_3_LOAD263]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT62:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP30]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT65:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT62]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT68:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT65]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT71:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT68]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT74:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT71]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT77:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT74]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT80:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT77]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT83:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT80]], i32 [[TMP31]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT86:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT83]], i32 
[[TMP32]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT89:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT86]], i32 [[TMP33]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT264]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT89]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT62:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP30]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT65:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT62]], i32 [[TMP31]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT68:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT65]], i32 [[TMP32]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT71:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT68]], i32 [[TMP33]], 3 +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT264]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT71]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 39: ; CLEANUP-CPS-NEXT: br i1 [[TMP15]], label [[TMP40:%.*]], label [[TMP59:%.*]] @@ -3939,17 +3821,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP303:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD304:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP303]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT305:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT302]], i32 [[DOTFCA_1_3_LOAD304]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT92:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP43]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT95:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT92]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT98:%.*]] = insertvalue 
[10 x i32] [[DOTFCA_1_INSERT95]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT101:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT98]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT104:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT101]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT107:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT104]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT110:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT107]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT113:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT110]], i32 [[TMP44]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT116:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT113]], i32 [[TMP45]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT119:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT116]], i32 [[TMP46]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT305]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT119]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT74:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP43]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT77:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT74]], i32 [[TMP44]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT80:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT77]], i32 [[TMP45]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT83:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT80]], i32 [[TMP46]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT305]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT83]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 50: ; CLEANUP-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -4010,17 +3886,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP344:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD345:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP344]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT346:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT343]], i32 [[DOTFCA_1_3_LOAD345]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT122:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP52]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT125:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT122]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT128:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT125]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT131:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT128]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT134:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT131]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT137:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT134]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT140:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT137]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT143:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT140]], i32 [[TMP53]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT146:%.*]] = insertvalue [10 x i32] 
[[DOTFCA_7_INSERT143]], i32 [[TMP54]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT149:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT146]], i32 [[TMP55]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT346]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT149]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT86:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP52]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT89:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT86]], i32 [[TMP53]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT92:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT89]], i32 [[TMP54]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT95:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT92]], i32 [[TMP55]], 3 +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT346]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT95]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 59: ; CLEANUP-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) @@ -4084,22 +3954,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP385:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD386:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP385]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT387:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT384]], i32 [[DOTFCA_1_3_LOAD386]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT152:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP60]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT155:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT152]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: 
[[DOTFCA_2_INSERT158:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT155]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT161:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT158]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT164:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT161]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT167:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT164]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT170:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT167]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT173:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT170]], i32 [[TMP61]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT176:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT173]], i32 [[TMP62]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT179:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT176]], i32 [[TMP63]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT387]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT179]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT98:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP60]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT101:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT98]], i32 [[TMP61]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT104:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT101]], i32 [[TMP62]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT107:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT104]], i32 [[TMP63]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT387]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT107]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4208,7 +4072,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader.resume.0) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: isEnd.i: ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -4271,7 +4135,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 9: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -4318,15 +4182,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META43]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META43]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 1 ; CLEANUP-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 2 @@ -4357,7 +4221,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 27 ; CLEANUP-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 29 -; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 0 ; 
CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 0, 0, 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 1, 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 1, 1 @@ -4421,7 +4285,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 8: ; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -4470,12 +4334,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader2( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER2_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4584,7 +4448,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader2.resume.0) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: isEnd.i: ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -4647,7 +4511,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 9: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -4694,15 +4558,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 1 ; CLEANUP-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 2 @@ -4733,7 +4597,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 27 ; CLEANUP-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 29 -; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 0 ; 
CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 0, 0, 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 1, 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 1, 1 @@ -4797,7 +4661,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 8: ; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER2_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -4846,23 +4710,17 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyMissShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META38]] !continuation [[META45:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !lgc.cps [[META39]] !continuation [[META45:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; CLEANUP-CPS-NEXT: 
[[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = bitcast i32 [[PAYLOAD_FCA_0_EXTRACT]] to float ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 @@ -4882,17 +4740,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> , i32 3 ; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP4]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: 
[[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP5]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP6]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP7]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP4]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP5]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP6]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP7]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -4976,7 +4828,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-CPS-LABEL: define void @MyRayGen( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5000,39 +4852,27 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP12]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 -; POSTPROCESS-CPS-NEXT: 
[[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP11]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP12]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP11]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP9]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP10]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 4, i32 [[TMP13]], i64 [[TMP8]], i32 5, [36 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 4, i32 [[TMP13]], i64 [[TMP8]], i32 5, [30 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META35]] !continuation [[META36]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: -; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, align 8 +; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 6 -; 
POSTPROCESS-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 9 +; POSTPROCESS-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], ptr [[TMP4]], align 4 +; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = bitcast i32 [[DOTFCA_0_EXTRACT]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = bitcast i32 [[DOTFCA_7_EXTRACT]] to float @@ -5041,11 +4881,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP8]], i32 2 ; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = bitcast i32 [[DOTFCA_9_EXTRACT]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP9]], i32 3 -; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP10]], 0 ; POSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 
0) ; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[RES_1_I1:%.*]] = load i32, ptr [[TMP12]], align 4 ; POSTPROCESS-CPS-NEXT: [[RESPTR_2_I2:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 1 ; POSTPROCESS-CPS-NEXT: [[RES_2_I3:%.*]] = load i32, ptr [[RESPTR_2_I2]], align 4 @@ -5055,7 +4895,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[VAL_1_I7:%.*]] = insertelement <3 x i32> [[VAL_0_I6]], i32 [[RES_2_I3]], i32 1 ; POSTPROCESS-CPS-NEXT: [[VAL_2_I8:%.*]] = insertelement <3 x i32> [[VAL_1_I7]], i32 [[RES_3_I5]], i32 2 ; POSTPROCESS-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[VAL_2_I8]], i8 0 -; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[RES_1_I:%.*]] = load i32, ptr [[TMP13]], align 4 ; POSTPROCESS-CPS-NEXT: [[RESPTR_2_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP13]], i32 0, i32 0, i32 1 ; POSTPROCESS-CPS-NEXT: [[RES_2_I:%.*]] = load i32, ptr [[RESPTR_2_I]], align 4 @@ -5076,21 +4916,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-CPS-LABEL: define void @MyClosestHitShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], 
[[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37:![0-9]+]] !lgc.cps [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 
+; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_GEP]], align 4 @@ -5133,38 +4967,26 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT10:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP17]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] 
[[DOTFCA_6_INSERT]], i32 [[TMP18]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP19]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP20]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP17]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP18]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP19]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP20]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP24:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP24]], i32 [[TMP25]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP24]], i32 [[TMP25]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyAnyHitShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META34:![0-9]+]] !continuation [[META41:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] 
[[PAYLOAD]], 6 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 0, 0 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_0_0_GEP]], align 4 @@ -5357,31 +5179,25 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT]], i32 [[DOTFCA_1_3_LOAD]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP19]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: 
[[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP19]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP20]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP21]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP22]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP30:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP30]], i32 [[TMP31]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP30]], i32 [[TMP29]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 30: ; POSTPROCESS-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP33]]) ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT25:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP34:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT25]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT25]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT34:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP35:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT34]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP37:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT34]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT42:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT42]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP34:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT42]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT52:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 -; POSTPROCESS-CPS-NEXT: [[TMP37:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT52]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP35:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT52]] to i32 ; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP38:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP38]] to float @@ 
-5434,19 +5250,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP261:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD262:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP261]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT263:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT260]], i32 [[DOTFCA_1_3_LOAD262]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT61:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP34]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT64:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT61]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT67:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT64]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT70:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT67]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT73:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT70]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT76:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT73]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT79:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT76]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT82:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT79]], i32 [[TMP35]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT85:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT82]], i32 [[TMP36]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT88:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT85]], i32 [[TMP37]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT62:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP36]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT65:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT62]], i32 [[TMP37]], 1 +; POSTPROCESS-CPS-NEXT: 
[[DOTFCA_2_INSERT68:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT65]], i32 [[TMP34]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT71:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT68]], i32 [[TMP35]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP45:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP45]], i32 [[TMP46]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT263]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT88]]) +; POSTPROCESS-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP45]], i32 [[TMP43]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT263]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT71]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 43: ; POSTPROCESS-CPS-NEXT: br i1 [[TMP15]], label [[TMP48:%.*]], label [[TMP75:%.*]] @@ -5511,31 +5321,25 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP302:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD303:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP302]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT304:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT301]], i32 [[DOTFCA_1_3_LOAD303]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT91:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP51]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT94:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT91]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT97:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT94]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT100:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT97]], i32 
[[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT103:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT100]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT106:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT103]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT109:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT106]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT112:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT109]], i32 [[TMP52]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT115:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT112]], i32 [[TMP53]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT118:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT115]], i32 [[TMP54]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT74:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP51]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT77:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT74]], i32 [[TMP52]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT80:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT77]], i32 [[TMP53]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT83:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT80]], i32 [[TMP54]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP60:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP60]], i32 [[TMP61]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT304]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT118]]) +; POSTPROCESS-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP60]], i32 [[TMP64]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT304]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT83]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 56: ; POSTPROCESS-CPS-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP63]]) ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT29:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP64:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT29]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP58:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT29]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT38:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP65:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT38]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP59:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT38]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT46:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP66:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT46]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP65:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT46]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT56:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 -; POSTPROCESS-CPS-NEXT: [[TMP67:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT56]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP61:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT56]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP68:%.*]] = bitcast i32 [[TMP6]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0404_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP68]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP69:%.*]] = bitcast i32 [[TMP7]] to float @@ -5584,30 +5388,24 @@ attributes #5 = { nocallback nofree 
nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP343:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD344:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP343]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT345:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT342]], i32 [[DOTFCA_1_3_LOAD344]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT121:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP64]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT124:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT121]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT127:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT124]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT130:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT127]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT133:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT130]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT136:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT133]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT139:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT136]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT142:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT139]], i32 [[TMP65]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT145:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT142]], i32 [[TMP66]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT148:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT145]], i32 [[TMP67]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT86:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP58]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT89:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT86]], i32 [[TMP59]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT92:%.*]] = insertvalue [4 x 
i32] [[DOTFCA_1_INSERT89]], i32 [[TMP65]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT95:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT92]], i32 [[TMP61]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP73:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP74:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP73]], i32 [[TMP74]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT345]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT148]]) +; POSTPROCESS-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP73]], i32 [[TMP66]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT345]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT95]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 67: ; POSTPROCESS-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT31:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP76:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT31]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP72:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT31]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT40:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP77:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT40]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP74:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT40]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT48:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP78:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT48]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP76:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT48]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT58:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 
3 -; POSTPROCESS-CPS-NEXT: [[TMP79:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT58]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP71:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT58]] to i32 ; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP80:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP81:%.*]] = bitcast i32 [[TMP80]] to float @@ -5660,24 +5458,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP384:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD385:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP384]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT386:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT383]], i32 [[DOTFCA_1_3_LOAD385]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT151:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP76]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT154:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT151]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT157:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT154]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT160:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT157]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT163:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT160]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT166:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT163]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT169:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT166]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT172:%.*]] = insertvalue [10 x i32] 
[[DOTFCA_6_INSERT169]], i32 [[TMP77]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT175:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT172]], i32 [[TMP78]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT178:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT175]], i32 [[TMP79]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT98:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP72]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT101:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT98]], i32 [[TMP74]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT104:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT101]], i32 [[TMP76]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT107:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT104]], i32 [[TMP71]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP87:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP88:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP87]], i32 [[TMP88]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT386]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT178]]) +; POSTPROCESS-CPS-NEXT: [[TMP78:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP87]], i32 [[TMP78]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT386]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT107]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyIntersectionShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5792,7 +5584,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: isEnd.i: ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -5859,7 +5651,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP15]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP16]], i32 [[TMP17]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP16]], i32 [[TMP17]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 18: ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -5910,18 +5702,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP21]], i32 [[TMP22]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP21]], i32 [[TMP22]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META43]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META43]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -8 -; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 +; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 1 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 2 @@ -5952,7 +5744,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29 -; 
POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 0, 0, 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 1, 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 1, 1 @@ -6021,7 +5813,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP12]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = zext i32 [[RETURN_ADDR_RELOAD2]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP13]], i32 [[TMP14]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP13]], i32 [[TMP14]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 15: ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(21) @@ -6075,12 +5867,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP19]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP20:%.*]] = zext i32 [[RETURN_ADDR_RELOAD]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP20]], i32 [[TMP21]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP20]], i32 [[TMP21]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyIntersectionShader2( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -6195,7 +5987,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader2.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: isEnd.i: ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -6262,7 +6054,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP15]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP16]], i32 [[TMP17]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP16]], i32 [[TMP17]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 18: ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -6313,18 +6105,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP21]], i32 [[TMP22]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP21]], i32 [[TMP22]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = 
load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -8 -; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 +; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 1 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 2 @@ -6355,7 +6147,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29 -; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 0, 0, 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 1, 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 1, 1 @@ -6424,7 +6216,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP12]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = zext i32 [[RETURN_ADDR_RELOAD2]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP13]], i32 [[TMP14]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP13]], i32 [[TMP14]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 15: ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(21) @@ -6478,25 +6270,19 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP19]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP20:%.*]] = zext i32 [[RETURN_ADDR_RELOAD]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP20]], i32 [[TMP21]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP20]], i32 [[TMP21]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyMissShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META38]] !continuation [[META45:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !lgc.cps [[META39]] !continuation [[META45:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; 
POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; POSTPROCESS-CPS-NEXT: [[TMP0:%.*]] = bitcast i32 [[PAYLOAD_FCA_0_EXTRACT]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 @@ -6516,18 +6302,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> , i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP4]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: 
[[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP5]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP6]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP7]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP4]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP5]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP6]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP7]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP10]], i32 [[TMP11]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP10]], i32 [[TMP11]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/payload-save-registers.ll b/llvmraytracing/test/dx/payload-save-registers.ll index 21a0ee7046..1107306d2a 100644 --- a/llvmraytracing/test/dx/payload-save-registers.ll +++ b/llvmraytracing/test/dx/payload-save-registers.ll @@ -31,209 +31,231 @@ declare !pointeetys !48 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, ; Function Attrs: nounwind define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !pointeetys !23 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @Miss( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META25:![0-9]+]] !continuation.registercount [[META23:![0-9]+]] !continuation [[META26:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [4 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META26:![0-9]+]] !continuation.registercount [[META24:![0-9]+]] !continuation [[META27:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [37 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_OUTERPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] 
= getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = load ptr addrspace(32), ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
store i32 [[TMP11]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP12]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP27]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds i32, ptr [[TMP4]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP36]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP20]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP31]], ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP66]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds i32, ptr [[TMP6]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP70]], ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP37]], ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP70]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[TMP26]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP72]], ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] 
= load i32, ptr [[TMP29]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP30]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP72]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP30]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP74]], ptr [[TMP32]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr [[TMP32]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP76]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP75]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP74]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP76]], ptr [[TMP34]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP78]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP81:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP81]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP83]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP90]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP56]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP107]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP60]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP82]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP82]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP85]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP85]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP55]], align 4 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP92:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP92]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP92]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP141:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP141]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP141]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = load i32, ptr [[TMP40]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP40]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP63]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP65]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = alloca [[STRUCT_INNERPAYLOAD:%.*]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = bitcast ptr [[TMP46]] to ptr ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP47]]) #[[ATTR0]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 
0, i32 0, i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA27:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA28:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP71]], ptr [[TMP50]], align 4, !tbaa [[TBAA27]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP73]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP116]], ptr [[TMP50]], align 4, !tbaa [[TBAA28]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP84]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP52]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] 
[[DIS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP57]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP78]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = load i32, ptr [[TMP57]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP119]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP79]]), !continuation.registercount [[META31:![0-9]+]], !continuation.returnedRegistercount [[META31]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } @await(ptr [[TMP80]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP81]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [4 x i32] poison, [1 x i32] [[TMP79]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } @await(ptr [[TMP122]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } [[TMP125]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP61]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_INNERPAYLOAD]] 
poison, ptr [[TMP46]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP84]], ptr [[TMP59]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP81]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP128:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP128]], ptr [[TMP59]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } [[TMP125]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP58]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP50]], align 4, !tbaa [[TBAA27]] -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP86]], ptr [[TMP48]], align 4, !tbaa [[TBAA27]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = load float, ptr [[TMP50]], align 4, !tbaa [[TBAA28]] +; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP97]], ptr [[TMP48]], align 4, !tbaa [[TBAA28]] ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP47]]) #[[ATTR0]] -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP77]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP80]], ptr [[TMP98]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], 
ptr [[TMP99]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP86]], ptr [[TMP131]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP96]], ptr [[TMP101]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP105]], ptr [[TMP102]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP108]], ptr [[TMP134]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP110]], ptr [[TMP104]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP83]], ptr [[TMP87]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP111]], ptr [[TMP87]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP90]], ptr [[TMP88]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[TMP88]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP56]], ptr [[TMP89]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP89]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP96]], ptr [[TMP39]], align 4 +; 
LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP39]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP60]], ptr [[TMP91]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP71]], ptr [[TMP91]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP142]], ptr [[TMP42]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP73]], ptr [[TMP42]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP93:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP93]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP75]], ptr [[TMP93]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP66]], ptr [[TMP49]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = load ptr addrspace(32), ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP77]], ptr [[TMP49]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP62]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP98]], ptr [[TMP97]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = load i32, ptr [[TMP62]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP114]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = 
getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP101]], ptr [[TMP99]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = load i32, ptr [[TMP100]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP117]], ptr [[TMP137]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP103]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP104]], ptr [[TMP102]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP103]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP120]], ptr [[TMP140]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP146:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP106:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP106]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP107]], ptr [[TMP105]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP106]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP123]], ptr [[TMP146]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP149:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], 
i32 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP109:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP109]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP110]], ptr [[TMP108]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP109]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP126]], ptr [[TMP149]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP152:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP112]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[TMP111]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 6 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = load i32, ptr [[TMP112]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP129]], ptr [[TMP152]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP155:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP116]], ptr [[TMP114]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 7 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = load i32, ptr [[TMP115]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP132]], ptr [[TMP155]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP157:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = 
getelementptr inbounds i32, ptr [[TMP62]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = load i32, ptr [[TMP118]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP119]], ptr [[TMP117]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = load i32, ptr [[TMP118]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP135]], ptr [[TMP157]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP158:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP121]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP122]], ptr [[TMP120]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 9 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = load i32, ptr [[TMP121]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP138]], ptr [[TMP158]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP159:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP124]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP125]], ptr [[TMP123]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 10 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP124]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP160]], ptr [[TMP159]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP127:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 10 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP128:%.*]] = load i32, ptr [[TMP127]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP128]], ptr [[TMP126]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 11 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = load i32, ptr [[TMP127]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP144]], ptr [[TMP142]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP131]], ptr [[TMP129]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 12 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP147:%.*]] = load i32, ptr [[TMP130]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP147]], ptr [[TMP145]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP148:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = load i32, ptr [[TMP133]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP134]], ptr [[TMP132]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 13 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP150:%.*]] = load i32, ptr [[TMP133]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP150]], ptr [[TMP148]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP151:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP136:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = load 
i32, ptr [[TMP136]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP137]], ptr [[TMP135]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 14 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP153:%.*]] = load i32, ptr [[TMP136]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP153]], ptr [[TMP151]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP154:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP139:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = load i32, ptr [[TMP139]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP140]], ptr [[TMP138]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP156:%.*]] = load i32, ptr [[TMP139]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP156]], ptr [[TMP154]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP94]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [10 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [4 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META24]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 @@ -256,7 +278,7 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !poi ; Function Attrs: nounwind define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeetys !23 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @Callable( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation.registercount [[META23]] !continuation [[META33:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [4 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META33:![0-9]+]] !continuation.registercount [[META24]] !continuation [[META34:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_OUTERPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 @@ -624,9 +646,9 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP488:%.*]] = load i32, ptr [[TMP270]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP488]], ptr [[TMP269]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP272:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP489:%.*]] = call ptr inttoptr (i64 2 to 
ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [11 x i32] poison, [30 x i32] [[TMP272]]), !continuation.registercount [[META23]], !continuation.returnedRegistercount [[META23]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP274:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [30 x i32] } @await.1(ptr [[TMP489]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP490:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [30 x i32] } [[TMP274]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP489:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [4 x i32] poison, [30 x i32] [[TMP272]]), !continuation.registercount [[META24]], !continuation.returnedRegistercount [[META24]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP274:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } @await.1(ptr [[TMP489]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP490:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } [[TMP274]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP490]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_OUTERPAYLOAD]] poison, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP224:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0 @@ -748,7 +770,7 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP363:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP364:%.*]] = load i32, ptr [[TMP363]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP364]], ptr [[TMP275]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP223:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [30 x i32] } [[TMP274]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP223:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } [[TMP274]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: 
store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP223]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] @@ -934,7 +956,7 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP484]], ptr [[TMP482]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP382:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP486:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [10 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [4 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META24]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = alloca %struct.OuterPayload, align 8 @@ -1138,10 +1160,7 @@ declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.type declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #3 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #3 - -; Function Attrs: alwaysinline -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #3 +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) #3 ; Function Attrs: alwaysinline declare !pointeetys !32 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #3 @@ -1184,7 +1203,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i ; 
Function Attrs: alwaysinline define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #3 !pointeetys !45 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -1210,6 +1229,7 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !dx.typeAnnotations = !{!7} !dx.dxrPayloadAnnotations = !{!12} !dx.entryPoints = !{!17, !18, !21} +!lgc.rt.max.attribute.size = !{!49} !0 = !{!"dxcoob 2019.05.00"} !1 = !{i32 1, i32 7} @@ -1260,3 +1280,4 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !46 = !{i8 poison} !47 = !{i32 0, i8 poison} !48 = !{%struct.AnyHitTraversalData poison} +!49 = !{i32 8} diff --git a/llvmraytracing/test/dx/remat-intrinsic.ll b/llvmraytracing/test/dx/remat-intrinsic.ll index e51fe74b9a..5a538b179f 100644 --- a/llvmraytracing/test/dx/remat-intrinsic.ll +++ b/llvmraytracing/test/dx/remat-intrinsic.ll @@ -23,7 +23,7 @@ declare i32 @_cont_GetContinuationStackAddr() declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) declare !pointeetys !14 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) @@ -39,7 +39,7 @@ define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeet define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !pointeetys !20 { %dis_data = load 
%struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -93,6 +93,7 @@ attributes #1 = { nounwind } !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6} !continuation.maxPayloadRegisterCount = !{!13} +!lgc.rt.max.attribute.size = !{!26} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -120,13 +121,14 @@ attributes #1 = { nounwind } !23 = !{%struct.MyParams poison} !24 = !{i32 0, %struct.TraversalData poison} !25 = !{%struct.TraversalData poison} +!26 = !{i32 8} ; POSTPROCESS-LABEL: define i32 @_cont_GetLocalRootIndex( ; POSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR1:[0-9]+]] { ; POSTPROCESS-NEXT: ret i32 5 ; ; ; POSTPROCESS-LABEL: define void @called( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META16:![0-9]+]] !lgc.rt.shaderstage [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META17:![0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation.stacksize [[META14:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -151,23 +153,23 @@ attributes #1 = { nounwind } ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 
[[PAYLOAD_FCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP7]], i64 [[TMP8]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP1:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [2 x i32], [1 x i32] } [[TMP1:%.*]]) !continuation [[META17]] !lgc.rt.shaderstage [[META18]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[TMP16:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], ptr [[TMP16]], align 4 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP13]], -8 -; POSTPROCESS-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2 
; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP4]], 0 -; POSTPROCESS-NEXT: [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP15]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) @@ -200,6 +202,6 @@ attributes #1 = { nounwind } ; POSTPROCESS-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP12]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [8 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP12]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]) ; POSTPROCESS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/traceray.ll b/llvmraytracing/test/dx/traceray.ll index eac4b2f7d3..b6b73db5bb 100644 --- a/llvmraytracing/test/dx/traceray.ll +++ b/llvmraytracing/test/dx/traceray.ll @@ -34,7 +34,7 @@ declare %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i64, i64, %struct.Tra declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 -declare %struct.TraversalData @_AmdAwaitAnyHit(i64, %struct.TraversalData, float, i32) #0 +declare %struct.TraversalData @_AmdAwaitAnyHit(i64, i64, %struct.TraversalData) #0 declare void @lgc.cps.jump(...) 
#0 @@ -122,7 +122,7 @@ define i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) anyhit: ; preds = %0 %trav_data = load %struct.TraversalData, %struct.TraversalData* %data, align 4 - %newdata = call %struct.TraversalData @_AmdAwaitAnyHit(i64 3, %struct.TraversalData %trav_data, float %t, i32 %hitKind) + %newdata = call %struct.TraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.TraversalData %trav_data) store %struct.TraversalData %newdata, %struct.TraversalData* %data, align 4 call void @_AmdRestoreSystemDataAnyHit(%struct.TraversalData* %data) ret i1 true @@ -319,6 +319,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !dx.entryPoints = !{!18, !20, !23, !25, !27, !29, !31} !lgc.cps.module = !{} !continuation.stackAddrspace = !{!70} ; SKIP_GLOBAL_ADDRSPACE +!lgc.rt.max.attribute.size = !{!71} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -391,6 +392,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !68 = !{i32 4} !69 = !{i32 7} !70 = !{i32 22} +!71 = !{i32 32} ; Intentionally allow more than the max used (7) so we can test that the actually used size is used. ; LOWERRAYTRACINGPIPELINE-LABEL: define i1 @_cont_IsEndSearch( ; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { @@ -418,7 +420,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @_cont_KernelEntry( -; LOWERRAYTRACINGPIPELINE-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) ; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison), !continuation.registercount [[META22]] @@ -440,7 +442,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META36:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META37:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -451,7 +453,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA38:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -476,7 +478,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP41]]), !continuation.registercount [[META33:![0-9]+]], !waitmask [[META40:![0-9]+]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP41]]), !continuation.registercount [[META34:![0-9]+]], !waitmask [[META41:![0-9]+]], !continuation.returnedRegistercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } @await(ptr [[TMP42]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP43]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[TMP24]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -501,7 +503,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA38]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() @@ -519,7 +521,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META42:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META43:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 @@ -583,12 +585,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP44]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], 
align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], [21 x i32] poison, [10 x i32] [[TMP47]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], [21 x i32] poison, [10 x i32] [[TMP47]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -685,7 +687,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP58]], ptr [[ADDR_I1]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 
-; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP60]], [8 x i32] poison, [10 x i32] [[TMP68]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP60]], [8 x i32] poison, [10 x i32] [[TMP68]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 59: ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP25]], ptr [[TMP24]], align 4 @@ -717,12 +719,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP79]], ptr [[ADDR_I2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP81]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP81]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META33:![0-9]+]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -745,7 +747,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], [20 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], [6 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } @await.1(ptr [[TMP13]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP19]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP25]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -773,18 +775,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 22: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 25: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShaderLargeAttrs( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !continuation.registercount [[META32]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46]] !continuation.registercount [[META33]] !continuation [[META48:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4 @@ -821,7 +823,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_LARGEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I1]], i32 0, 
[[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP8]], [15 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP8]], [1 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } @await.2(ptr [[TMP13]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP34]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP35]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -868,18 +870,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 36: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP31]], [8 x i32] poison, [30 x i32] [[TMP38]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP31]], [8 x i32] poison, [30 x i32] [[TMP38]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 39: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP33]], [8 x i32] poison, [30 x i32] [[TMP41]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP33]], [8 x i32] poison, [30 x i32] [[TMP41]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyMissShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META48:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META49:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META49:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META50:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], 
align 8 @@ -922,7 +924,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP27]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], [21 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], [21 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -952,7 +954,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -976,7 +978,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation 
[[META37:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META38:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1016,7 +1018,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META37]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META38]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -1063,7 +1065,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: 
[[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1131,7 +1133,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyAnyHitShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -1365,7 +1367,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META44:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META45:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; 
DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1462,7 +1464,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: accepthit.i: ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -1570,7 +1572,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META43]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META44]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; 
DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1718,7 +1720,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !continuation [[META45:![0-9]+]] !continuation.stacksize [[META44]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !continuation [[META46:![0-9]+]] !continuation.stacksize [[META45]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1821,7 +1823,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: accepthit.i: ; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = bitcast i32 100 to float @@ -1925,7 +1927,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META45]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META46]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2073,7 +2075,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 
@@ -2148,7 +2150,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) @@ -2174,7 +2176,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META37:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META38:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2216,7 +2218,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META37]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: 
i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META38]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -2265,7 +2267,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2335,7 +2337,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyAnyHitShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x 
i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -2571,7 +2573,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META44:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META45:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2669,7 +2671,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: accepthit.i: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -2777,7 +2779,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META43]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META44]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2925,7 +2927,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] 
!continuation [[META45:![0-9]+]] !continuation.stacksize [[META44]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !continuation [[META46:![0-9]+]] !continuation.stacksize [[META45]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3029,7 +3031,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: accepthit.i: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = bitcast i32 100 to float @@ -3133,7 +3135,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META45]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META46]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3281,7 +3283,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: 
[[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3358,7 +3360,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @_cont_KernelEntry( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison) @@ -3380,7 +3382,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -3391,7 +3393,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = 
bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA38:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA39:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -3416,7 +3418,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP21]]), !waitmask [[META41:![0-9]+]], !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP21]]), !waitmask [[META42:![0-9]+]], !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP22]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 @@ -3440,7 +3442,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: .split: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA38]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA39]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() @@ -3458,7 +3460,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void 
@MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 @@ -3521,12 +3523,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP40]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP41]], [21 x i32] poison, [10 x i32] [[TMP42]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP41]], [21 x i32] poison, [10 x i32] [[TMP42]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44]] !lgc.cps [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -3622,7 +3624,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP52]], ptr [[ADDR_I1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP54]], [8 x i32] poison, [10 x i32] [[TMP55]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP54]], [8 x i32] poison, [10 x i32] [[TMP55]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 56: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP26]], ptr [[TMP25]], align 4 @@ -3654,12 +3656,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP72]], ptr [[ADDR_I2]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP74]], [8 x i32] poison, [10 x i32] [[TMP75]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP74]], [8 x i32] poison, [10 x i32] [[TMP75]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META48:![0-9]+]] !continuation [[META49:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -3682,7 +3684,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [20 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [6 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -3709,18 +3711,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS: 20: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP22]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP22]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 23: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META47]] !continuation [[META49:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META48]] !continuation [[META50:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4 @@ -3757,7 +3759,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_LARGEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP7]], [15 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP7]], [1 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -3803,18 +3805,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS: 34: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP35]], [8 x i32] poison, [30 x i32] [[TMP36]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP35]], [8 x i32] poison, [30 x i32] [[TMP36]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 37: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP38]], [8 x i32] poison, [30 x i32] [[TMP39]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP38]], [8 x i32] poison, [30 x i32] [[TMP39]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyMissShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !lgc.cps [[META43]] !continuation [[META50:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46]] !lgc.cps [[META44]] !continuation [[META51:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 @@ -3856,7 +3858,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [21 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [21 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -3886,7 +3888,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @_cont_KernelEntry( -; CLEANUP-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] { +; CLEANUP-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] { ; CLEANUP-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; CLEANUP-CPS-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) ; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison) @@ -3908,7 +3910,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyRayGen( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -3940,12 +3942,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP8]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !waitmask [[META38:![0-9]+]], !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !waitmask [[META39:![0-9]+]], !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META37]] !continuation [[META38]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, align 8 ; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4 @@ -3989,7 +3991,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyClosestHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = 
extractvalue [10 x i32] [[PAYLOAD]], 0 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 @@ -4049,12 +4051,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP17]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP18]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP19]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyAnyHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = alloca 
[[STRUCT_HITDATA:%.*]], align 8 ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -4218,7 +4220,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT73]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT73]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 28: ; CLEANUP-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) @@ -4278,12 +4280,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT49:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT46]], i32 [[TMP30]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT52:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT49]], i32 [[TMP31]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT55:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT52]], i32 [[TMP32]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT99]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT55]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT99]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT55]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4374,7 +4376,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader.resume.0) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT327]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT327]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: accepthit.i: ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -4429,7 +4431,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 6: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_EXTRACT]], 0, 0, 0 @@ -4471,12 +4473,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META45]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META46]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 @@ -4564,7 +4566,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 8: ; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -4608,12 +4610,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META46:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADERLARGEATTRS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4710,7 +4712,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShaderLargeAttrs.resume.0) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: accepthit.i: ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = bitcast i32 100 to float @@ -4761,7 +4763,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 4: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_EXTRACT]], 0, 0, 0 @@ -4803,12 +4805,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META46]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META47]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 @@ -4896,7 +4898,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD6]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD6]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 8: ; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADERLARGEATTRS_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -4940,12 +4942,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyMissShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META40]] !continuation [[META47:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META41]] !continuation [[META48:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 @@ -4987,7 +4989,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP5]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP6]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP7]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -5017,7 +5019,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-CPS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5041,7 +5043,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5081,7 +5083,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} 
[[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META37]] !continuation [[META38]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, align 8 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -5127,7 +5129,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5196,7 +5198,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyAnyHitShader( -; 
DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -5431,7 +5433,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; 
DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5528,7 +5530,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT326]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT326]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; DXILCONTPOSTPROCESS-CPS: accepthit.i: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -5638,7 +5640,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META44]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage 
[[META37]] !lgc.cps [[META44]] !continuation [[META45]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5788,7 +5790,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META45:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META46:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5891,7 +5893,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; DXILCONTPOSTPROCESS-CPS: accepthit.i: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = bitcast i32 100 to float @@ -5997,7 +5999,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META45]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META46]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -6147,7 +6149,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] 
!lgc.rt.shaderstage [[META41]] !lgc.cps [[META39]] !continuation [[META46:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META40]] !continuation [[META47:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 diff --git a/llvmraytracing/test/dx/traversal-empty-payload.ll b/llvmraytracing/test/dx/traversal-empty-payload.ll index 2e3a304308..86118f8d91 100644 --- a/llvmraytracing/test/dx/traversal-empty-payload.ll +++ b/llvmraytracing/test/dx/traversal-empty-payload.ll @@ -10,7 +10,7 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: %struct.SystemData = type { %struct.DispatchSystemData, float } %struct.DispatchSystemData = type { i32 } -!continuation.preservedPayloadRegisterCount = !{!8} ; EMPTY_PAYLOAD +!continuation.maxUsedPayloadRegisterCount = !{!8} ; EMPTY_PAYLOAD declare !pointeetys !4 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) diff --git a/llvmraytracing/test/dx/traversal-passthrough-payload.ll b/llvmraytracing/test/dx/traversal-passthrough-payload.ll index 6d75c1ba92..9224962e8c 100644 --- a/llvmraytracing/test/dx/traversal-passthrough-payload.ll +++ b/llvmraytracing/test/dx/traversal-passthrough-payload.ll @@ -10,7 +10,7 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: %struct.SystemData = type { %struct.DispatchSystemData, float } %struct.DispatchSystemData = type { i32 } -!continuation.preservedPayloadRegisterCount = !{!8} ; PRESERVED_REGCOUNT +!continuation.maxUsedPayloadRegisterCount = !{!8} ; PRESERVED_REGCOUNT declare !pointeetys !4 i32 
@_cont_GetLocalRootIndex(%struct.DispatchSystemData*) diff --git a/llvmraytracing/test/lgccps/entry-point-with-cps.ll b/llvmraytracing/test/lgccps/entry-point-with-cps.ll index 02f18af2e8..c974db97f7 100644 --- a/llvmraytracing/test/lgccps/entry-point-with-cps.ll +++ b/llvmraytracing/test/lgccps/entry-point-with-cps.ll @@ -8,6 +8,11 @@ declare void @lgc.cps.complete() +define void @_cont_KernelEntry() #0 !lgc.rt.shaderstage !{i32 7} { + call void @lgc.cps.complete() + unreachable +} + define spir_func void @raygen({} %state, i32 %rcr) !lgc.shaderstage !{i32 7} !lgc.cps !{i32 0} { %pushconst = call ptr addrspace(4) @lgc.user.data(i32 0) %fn = load ptr, ptr addrspace(4) %pushconst @@ -67,6 +72,9 @@ declare void @lgc.cps.await__isVoid(...) declare i32 @lgc.cps.await__i32(...) declare [2 x i32] @lgc.cps.await__a2i32(...) declare void @lgc.cps.jump(...) +; CHECK-LABEL: define void @_cont_KernelEntry( +; CHECK-NEXT: ret void + ; CHECK-LABEL: define spir_func void @raygen( ; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]]) !lgc.shaderstage [[META0:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { ; CHECK-NEXT: AllocaSpillBB: diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll index c0b0673fdb..279620cd4c 100644 --- a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll +++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll @@ -38,7 +38,7 @@ declare void @lgc.cps.jump(...) 
local_unnamed_addr !lgc.cps.module = !{} !continuation.maxPayloadRegisterCount = !{!11} -!continuation.preservedPayloadRegisterCount = !{!12} +!continuation.maxUsedPayloadRegisterCount = !{!12} !0 = !{i32 7} !1 = !{ { { i32 } } poison} diff --git a/llvmraytracing/test/lgccps/lower-traversal.ll b/llvmraytracing/test/lgccps/lower-traversal.ll index 0d6602dadf..cef8cab9d4 100644 --- a/llvmraytracing/test/lgccps/lower-traversal.ll +++ b/llvmraytracing/test/lgccps/lower-traversal.ll @@ -609,7 +609,7 @@ declare void @lgc.cps.jump(...) local_unnamed_addr declare ptr addrspace(7) @lgc.load.buffer.desc(i64 %0, i32 %1, i32 %2, i32 %3) local_unnamed_addr declare ptr @llvm.invariant.start.p7(i64 immarg %0, ptr addrspace(7) nocapture %1) -!continuation.preservedPayloadRegisterCount = !{!7} +!continuation.maxUsedPayloadRegisterCount = !{!7} !lgc.cps.module = !{} !lgc.rt.max.attribute.size = !{!4} diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 2a1ca060d5..19d92ea98e 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -712,6 +712,7 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.forwardPropagateNoContract = " << shaderInfo->options.forwardPropagateNoContract << "\n"; dumpFile << "options.constantBufferBindingOffset = " << shaderInfo->options.constantBufferBindingOffset << "\n"; dumpFile << "options.imageSampleDrefReturnsRgba = " << shaderInfo->options.imageSampleDrefReturnsRgba << "\n"; + dumpFile << "options.disableGlPositionOpt = " << shaderInfo->options.disableGlPositionOpt << "\n"; dumpFile << "\n"; // clang-format on } @@ -960,6 +961,30 @@ void PipelineDumper::dumpPipelineOptions(const PipelineOptions *options, std::os dumpFile << glStatePrefix << "enableLineSmooth = " << options->getGlState().enableLineSmooth << "\n"; dumpFile << glStatePrefix << "emulateWideLineStipple = " << options->getGlState().emulateWideLineStipple << "\n"; 
dumpFile << glStatePrefix << "enablePointSmooth = " << options->getGlState().enablePointSmooth << "\n"; + + // Output compile time constant info + if (options->compileConstInfo) { + auto compileConstInfo = options->compileConstInfo; + dumpFile << "options.compileTimeConstants.numCompileTimeConstants = " << compileConstInfo->numCompileTimeConstants + << "\n"; + for (unsigned i = 0; i < compileConstInfo->numCompileTimeConstants; ++i) { + dumpFile << "options.compileTimeConstants.constItem[" << i + << "].offset = " << compileConstInfo->pCompileTimeConstants[i].offset << "\n"; + dumpFile << "options.compileTimeConstants.constItem[" << i + << "].set = " << compileConstInfo->pCompileTimeConstants[i].set << "\n"; + dumpFile << "options.compileTimeConstants.constItem[" << i + << "].binding = " << compileConstInfo->pCompileTimeConstants[i].binding << "\n"; + dumpFile << "options.compileTimeConstants.constItem[" << i + << "].validBytes = " << compileConstInfo->pCompileTimeConstants[i].validBytes << "\n"; + dumpFile << "options.compileTimeConstants.constItem[" << i << "].values = "; + for (unsigned j = 0; j < compileConstInfo->pCompileTimeConstants[i].validBytes; ++j) { + dumpFile << compileConstInfo->pCompileTimeConstants[i].values.u32[j] << ""; + if (j < compileConstInfo->pCompileTimeConstants[i].validBytes - 1) + dumpFile << ", "; + } + dumpFile << "\n"; + } + } } // ===================================================================================================================== @@ -1060,6 +1085,7 @@ void PipelineDumper::dumpGraphicsStateInfo(const GraphicsPipelineBuildInfo *pipe dumpFile << "enableColorClampFs = " << pipelineInfo->glState.enableColorClampFs << "\n"; dumpFile << "enableFlatShade = " << pipelineInfo->glState.enableFlatShade << "\n"; dumpFile << "alphaTestFunc = " << pipelineInfo->glState.alphaTestFunc << "\n"; + dumpFile << "enableInitialUndefVar = " << pipelineInfo->enableInitUndefZero << "\n"; dumpFile << "originUpperLeft = " << 
pipelineInfo->getGlState().originUpperLeft << "\n"; if (pipelineInfo->clientMetadataSize > 0) { @@ -1551,6 +1577,8 @@ MetroHash::Hash PipelineDumper::generateHashForGraphicsPipeline(const GraphicsPi hasher.Update(pipeline->unlinked); hasher.Update(pipeline->enableEarlyCompile); hasher.Update(pipeline->dynamicTopology); + hasher.Update(pipeline->enableInitUndefZero); + if (unlinkedShaderType == UnlinkedStageFragment && isCacheHash) hasher.Update(pipeline->enableColorExportShader); updateHashForPipelineOptions(&pipeline->options, &hasher, isCacheHash, unlinkedShaderType); @@ -2003,6 +2031,7 @@ void PipelineDumper::updateHashForPipelineShaderInfo(ShaderStage stage, const Pi hasher->Update(options.backwardPropagateNoContract); hasher->Update(options.forwardPropagateNoContract); hasher->Update(options.imageSampleDrefReturnsRgba); + hasher->Update(options.disableGlPositionOpt); } } } diff --git a/tool/vfx/vfx.h b/tool/vfx/vfx.h index af2a234456..2e2696315b 100644 --- a/tool/vfx/vfx.h +++ b/tool/vfx/vfx.h @@ -156,9 +156,6 @@ class Float32 { // Constructor, initializes our VfxFloat32 with another VfxFloat32 Float32(const Float32 &other) : m_bits(other.m_bits) {} - // Destructor - ~Float32() {} - // Gets the numeric value float GetValue() const { return *reinterpret_cast(&m_bits.u32All); } diff --git a/tool/vfx/vfxParser.cpp b/tool/vfx/vfxParser.cpp index a661e1d7de..a87ee705d5 100644 --- a/tool/vfx/vfxParser.cpp +++ b/tool/vfx/vfxParser.cpp @@ -54,6 +54,7 @@ namespace Vfx { // Parser functions to parse a value by it's type bool parseInt(char *str, unsigned lineNum, IUFValue *output); +bool parseUint(char *str, unsigned lineNum, IUFValue *output); bool parseFloat(char *str, unsigned lineNum, IUFValue *output); bool parseFloat16(char *str, unsigned lineNum, IUFValue *output); bool parseDouble(char *str, unsigned lineNum, IUFValue *output); @@ -434,6 +435,12 @@ bool Document::parseKeyValue(char *key, char *valueStr, unsigned lineNum, Sectio result = 
accessedSectionObject->set(lineNum, memberName, arrayIndex, &(value.iVec4[0])); break; } + case MemberTypeUint: { + result = parseUint(valueStr, lineNum, &value); + if (result) + result = accessedSectionObject->set(lineNum, memberName, arrayIndex, &(value.uVec4[0])); + break; + } case MemberTypeFloat16: { result = parseFloat16(valueStr, lineNum, &value); if (result) @@ -708,6 +715,31 @@ bool parseInt(char *str, unsigned lineNum, IUFValue *output) { return result; } +// ===================================================================================================================== +// Parses an unsigned int number from a string. +// +// @param str : Input string +// @param lineNum : Current line number +// @param [out] output : Stores parsed value +bool parseUint(char *str, unsigned lineNum, IUFValue *output) { + VFX_ASSERT(output); + bool result = true; + + bool isHex = false; + char *p0x = strstr(str, "0x"); + if (p0x) + isHex = true; + + output->uVec4[0] = strtoul(str, nullptr, 0); + + output->props.isInt64 = false; + output->props.isFloat = false; + output->props.isDouble = false; + output->props.isHex = isHex; + output->props.length = 1; + + return result; +} // ===================================================================================================================== // Parses a float number from a string. 
// diff --git a/tool/vfx/vfxPipelineDoc.cpp b/tool/vfx/vfxPipelineDoc.cpp index caad47db8e..6281b8690f 100644 --- a/tool/vfx/vfxPipelineDoc.cpp +++ b/tool/vfx/vfxPipelineDoc.cpp @@ -416,6 +416,8 @@ bool PipelineDocument::getPtrOfSubSection(Section *section, unsigned lineNum, co #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 73 CASE_SUBSECTION(MemberTypeGlState, SectionGlState) #endif + CASE_SUBSECTION(MemberTypeCompileConstItem, SectionCompileConstItem) + CASE_SUBSECTION(MemberTypeCompileConstInfo, SectionCompileConstInfo) default: result = Document::getPtrOfSubSection(section, lineNum, memberName, memberType, isWriteAccess, arrayIndex, ptrOut, errorMsg); diff --git a/tool/vfx/vfxSection.h b/tool/vfx/vfxSection.h index 7a817eb03f..1ac6337469 100644 --- a/tool/vfx/vfxSection.h +++ b/tool/vfx/vfxSection.h @@ -83,6 +83,7 @@ enum SectionType : unsigned { // Enumerates VFX member type. enum MemberType : unsigned { MemberTypeInt, // VFX member type: 32 bit integer + MemberTypeUint, // VFX member type: 32 bit unsigned integer MemberTypeFloat, // VFX member type: 32 bit float MemberTypeFloat16, // VFX member type: 16 bit float MemberTypeDouble, // VFX member type: 64 bit double @@ -116,6 +117,8 @@ enum MemberType : unsigned { MemberTypeSpecEntryItem, // VFX member type: SectionSpecEntryItem MemberTypeResourceMappingNode, // VFX member type: SectionResourceMappingNode MemberTypeSpecInfo, // VFX member type: SectionSpecInfo + MemberTypeCompileConstItem, // VFX member type: SectionCompileConstItem + MemberTypeCompileConstInfo, // VFX member type: SectionCompileConstInfo MemberTypeDescriptorRangeValue, // VFX member type: SectionDescriptorRangeValueItem MemberTypePipelineOption, // VFX member type: SectionPipelineOption MemberTypeShaderOption, // VFX member type: SectionShaderOption @@ -356,7 +359,7 @@ struct StrToMemberAddrArrayRef { class Section { public: Section(StrToMemberAddrArrayRef addrTable, SectionType type, const char *sectionName); - virtual ~Section() {} + virtual 
~Section() = default; static SectionType getSectionType(const char *sectionName); static void initSectionInfo(); @@ -876,6 +879,83 @@ class SectionVertexInput : public Section { std::vector m_vbAddressLowBits; // Lowest two bits of vertex inputs offsets. }; +// ===================================================================================================================== +// Represents the sub section compile time constant map entry +class SectionCompileConstItem : public Section { +public: + typedef Vkgc::CompileTimeConst SubState; + + SectionCompileConstItem() : Section(getAddrTable(), SectionTypeUnset, "constItem") { + memset(&m_state, 0, sizeof(m_state)); + } + + void getSubState(SubState &state) { + state = m_state; + state.values.u32[0] = m_values.iVec4[0]; + state.values.u32[1] = m_values.iVec4[1]; + state.values.u32[2] = m_values.iVec4[2]; + state.values.u32[3] = m_values.iVec4[3]; + }; + SubState &getSubStateRef() { return m_state; }; + +private: + static StrToMemberAddrArrayRef getAddrTable() { + static std::vector addrTable = []() { + std::vector addrTableInitializer; + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, offset, MemberTypeInt, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, set, MemberTypeUint, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, binding, MemberTypeInt, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, validBytes, MemberTypeInt, false); + INIT_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, m_values, MemberTypeIVec4, false); + return addrTableInitializer; + }(); + return {addrTable.data(), addrTable.size()}; + } + + SubState m_state; + IUFValue m_values = {}; +}; + +// ===================================================================================================================== +// Represents the sub section compile time constant info +class SectionCompileConstInfo : public Section { +public: + typedef Vkgc::CompileConstInfo SubState; + + 
SectionCompileConstInfo() : Section(getAddrTable(), SectionTypeUnset, "compileTimeConstants") { + memset(&m_state, 0, sizeof(m_state)); + } + + void getSubState(SubState &state) { + memset(&state, 0, sizeof(SubState)); + if (m_constItem.size()) { + m_state.numCompileTimeConstants = static_cast(m_constItem.size()); + m_compileConsts.resize(m_state.numCompileTimeConstants); + for (unsigned i = 0; i < m_compileConsts.size(); ++i) + m_constItem[i].getSubState(m_compileConsts[i]); + m_state.pCompileTimeConstants = &m_compileConsts[0]; + state = m_state; + } else + memset(&m_state, 0, sizeof(SubState)); + } + SubState &getSubStateRef() { return m_state; }; + +private: + static StrToMemberAddrArrayRef getAddrTable() { + static std::vector addrTable = []() { + std::vector addrTableInitializer; + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstInfo, numCompileTimeConstants, MemberTypeInt, false); + INIT_MEMBER_DYNARRAY_NAME_TO_ADDR(SectionCompileConstInfo, m_constItem, MemberTypeCompileConstItem, true); + return addrTableInitializer; + }(); + return {addrTable.data(), addrTable.size()}; + } + + std::vector m_constItem; + std::vector m_compileConsts; + SubState m_state; +}; + // ===================================================================================================================== // Represents the sub section specialization constant map entry class SectionSpecEntryItem : public Section { diff --git a/tool/vfx/vfxVkSection.h b/tool/vfx/vfxVkSection.h index acdf016c59..f36c25ac15 100644 --- a/tool/vfx/vfxVkSection.h +++ b/tool/vfx/vfxVkSection.h @@ -257,6 +257,7 @@ class SectionShaderOption : public Section { INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, forwardPropagateNoContract, MemberTypeBool, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, constantBufferBindingOffset, MemberTypeInt, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, imageSampleDrefReturnsRgba, MemberTypeBool, false); + 
INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, disableGlPositionOpt, MemberTypeBool, false); return addrTableInitializer; }(); return {addrTable.data(), addrTable.size()}; @@ -481,6 +482,9 @@ class SectionPipelineOption : public Section { #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 73 m_glState.getSubState(m_state.glState); #endif + m_state.compileConstInfo = new Vkgc::CompileConstInfo(); + m_compileTimeConstants.getSubState(*m_state.compileConstInfo); + state.compileConstInfo = m_state.compileConstInfo; state = m_state; }; SubState &getSubStateRef() { return m_state; }; @@ -537,6 +541,7 @@ class SectionPipelineOption : public Section { #endif INIT_STATE_MEMBER_NAME_TO_ADDR(SectionPipelineOption, enablePrimGeneratedQuery, MemberTypeBool, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionPipelineOption, disablePerCompFetch, MemberTypeBool, false); + INIT_MEMBER_NAME_TO_ADDR(SectionPipelineOption, m_compileTimeConstants, MemberTypeCompileConstInfo, true); return addrTableInitializer; }(); return {addrTable.data(), addrTable.size()}; @@ -544,6 +549,7 @@ class SectionPipelineOption : public Section { SubState m_state; SectionExtendedRobustness m_extendedRobustness; + SectionCompileConstInfo m_compileTimeConstants; // Compile time constant info #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 73 SectionGlState m_glState; #endif @@ -956,6 +962,7 @@ class SectionGraphicsState : public Section { INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, useSoftwareVertexBufferDescriptors, MemberTypeBool, false); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_shaderLibrary, MemberTypeString, false); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_rtState, MemberTypeRtState, true); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, enableInitUndefZero, MemberTypeBool, false); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_clientMetadata, MemberTypeU8Array, false); INIT_MEMBER_ARRAY_NAME_TO_ADDR(SectionGraphicsState, m_uniformConstantMaps, MemberTypeUniformConstantMap, diff 
--git a/util/gpurtshim/GpurtShim.cpp b/util/gpurtshim/GpurtShim.cpp index 6e77b2c047..12f2265c8a 100644 --- a/util/gpurtshim/GpurtShim.cpp +++ b/util/gpurtshim/GpurtShim.cpp @@ -37,12 +37,6 @@ using namespace Vkgc; -void gpurt::getShaderLibrarySpirv(unsigned featureFlags, const void *&code, size_t &size) { - auto libCode = GpuRt::GetShaderLibraryCode(featureFlags); - code = libCode.pSpvCode; - size = libCode.spvSize; -} - RtIpVersion gpurt::getRtIpVersion(GfxIpVersion gfxIpVersion) { if (gfxIpVersion.major >= 11) return {2, 0}; @@ -71,6 +65,16 @@ static Pal::RayTracingIpLevel getRtIpLevel(RtIpVersion rtIpVersion) { abort(); } +void gpurt::getShaderLibrarySpirv(RtIpVersion rtIpVersion, unsigned featureFlags, const void *&code, size_t &size) { +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 48 + auto libCode = GpuRt::GetShaderLibraryCode(featureFlags); +#else + auto libCode = GpuRt::GetShaderLibraryCode(getRtIpLevel(rtIpVersion), featureFlags); +#endif + code = libCode.pSpvCode; + size = libCode.spvSize; +} + static void unmangleDxilName(char *dst, const char *src) { // input "\01?RayQueryProceed1_1@@YA_NURayQueryInternal@@IV?$vector@I$02@@@Z" // output "RayQueryProceed1_1" diff --git a/version/include/llpcVersion.h.in b/version/include/llpcVersion.h.in index 48aec80a2d..773df94485 100644 --- a/version/include/llpcVersion.h.in +++ b/version/include/llpcVersion.h.in @@ -37,6 +37,9 @@ // %Version History // | %Version | Change Description | // | -------- | ----------------------------------------------------------------------------------------------------- | +// | 75.4 | Add disableGlPositionOpt to PipelineShaderOptions. | +// | 75.3 | Add enableInitUndefZero to GraphicsPipelineBuildInfo. | +// | 75.2 | Add CompileConstInfo to PipelineShaderOptions. | // | 75.1 | Add alphaFunc to GraphicPipelineBuildInfo. | // | 75.0 | BuildRayTracingPipeline now will not generate kernel entry for pipeline library anymore. | // | 74.2 | Add enableMapClipDistMask to GraphicsPipelineBuildInfo. 
| @@ -193,7 +196,7 @@ #define LLPC_INTERFACE_MAJOR_VERSION 75 /// LLPC minor interface version. -#define LLPC_INTERFACE_MINOR_VERSION 0 +#define LLPC_INTERFACE_MINOR_VERSION 4 /// The client's LLPC major interface version #ifndef LLPC_CLIENT_INTERFACE_MAJOR_VERSION