diff --git a/.github/workflows/check-amdllpc-docker.yml b/.github/workflows/check-amdllpc-docker.yml index bb9a75f2fe..8f660b2cf8 100644 --- a/.github/workflows/check-amdllpc-docker.yml +++ b/.github/workflows/check-amdllpc-docker.yml @@ -62,13 +62,13 @@ jobs: echo "${{ github.event.pull_request.number }}" > pr_num.txt - name: Upload code coverage report as a GitHub artifact if: contains(matrix.feature-set, '+coverage') && github.event.pull_request.number - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: cov_report_${{ env.CONFIG_TAG }} path: ${{ env.COVERAGE_REPORT_FILES }} - name: Upload the PR number as a GitHub artifact if: contains(matrix.feature-set, '+coverage') && github.event.pull_request.number - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: pr_num path: pr_num.txt diff --git a/cmake/continuations.cmake b/cmake/continuations.cmake deleted file mode 100644 index f13118c443..0000000000 --- a/cmake/continuations.cmake +++ /dev/null @@ -1,34 +0,0 @@ -## - ####################################################################################################################### - # - # Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. - # - # Permission is hereby granted, free of charge, to any person obtaining a copy - # of this software and associated documentation files (the "Software"), to - # deal in the Software without restriction, including without limitation the - # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - # sell copies of the Software, and to permit persons to whom the Software is - # furnished to do so, subject to the following conditions: - # - # The above copyright notice and this permission notice shall be included in all - # copies or substantial portions of the Software. 
- # - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - # IN THE SOFTWARE. - # - ####################################################################################################################### - -set(LLPC_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/..") - -include("${LLPC_SOURCE_DIR}/cmake/llvmraytracing.cmake") - -# Deprecated transition macro for refactoring transition; use add_llvmraytracing_projects instead -macro(add_continuations_projects) - add_llvmraytracing_projects() - set(LLPC_RAYTRACING_ADD_TRANSITION_TARGETS ON) -endmacro() diff --git a/compilerutils/CMakeLists.txt b/compilerutils/CMakeLists.txt index 4aa8824093..5e6eb5b0b0 100644 --- a/compilerutils/CMakeLists.txt +++ b/compilerutils/CMakeLists.txt @@ -17,6 +17,10 @@ add_llvm_library(LLVMCompilerUtils lib/DxilToLlvm.cpp lib/TypeLowering.cpp lib/TypesMetadata.cpp + lib/ValueOriginTracking.cpp + lib/ValueOriginTrackingTestPass.cpp + lib/ValueSpecialization.cpp + lib/ValueSpecializationTestPass.cpp DEPENDS intrinsics_gen diff --git a/compilerutils/include/compilerutils/CompilerUtils.h b/compilerutils/include/compilerutils/CompilerUtils.h index e273499d47..207df2eef5 100644 --- a/compilerutils/include/compilerutils/CompilerUtils.h +++ b/compilerutils/include/compilerutils/CompilerUtils.h @@ -118,6 +118,8 @@ class CrossModuleInliner { // target module. llvm::GlobalValue *findCopiedGlobal(llvm::GlobalValue &sourceGv, llvm::Module &targetModule); + static std::string getCrossModuleName(llvm::GlobalValue &gv); + private: // Checks that we haven't processed a different target module earlier. 
void checkTargetModule(llvm::Module &targetModule) { diff --git a/compilerutils/include/compilerutils/TypesMetadata.h b/compilerutils/include/compilerutils/TypesMetadata.h index 2e319de7c2..52b0563f1c 100644 --- a/compilerutils/include/compilerutils/TypesMetadata.h +++ b/compilerutils/include/compilerutils/TypesMetadata.h @@ -65,7 +65,7 @@ class TypedFuncTy { // Construct a TypedFuncTy for the given result type and arg types. // This constructs the !pointeetys metadata; that can then be attached to a function // using writeMetadata(). - TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys); + TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys, bool IsVarArg = false); // Get a TypedFuncTy for the given Function, looking up the !pointeetys metadata. static TypedFuncTy get(const Function *F); diff --git a/compilerutils/include/compilerutils/ValueOriginTracking.h b/compilerutils/include/compilerutils/ValueOriginTracking.h new file mode 100644 index 0000000000..6e3d9215d4 --- /dev/null +++ b/compilerutils/include/compilerutils/ValueOriginTracking.h @@ -0,0 +1,275 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file ValueOriginTracking.h + * @brief Helpers for tracking the byte-wise origin of SSA values. + * + * @details + * Sometimes we are interested in the byte-wise contents of a value. + * If the value is a constant, this can be determined with standard LLVM helpers like computeKnownBits, + * but even if the value is dynamic it can be helpful to trace where these bytes come from. + * + * For instance, if some outgoing function arguments de-facto preserve incoming function arguments in the same argument + * slot, then this information may be used to enable certain inter-procedural optimizations. + * + * This file provides helpers for such an analysis. + * It can be thought of splitting values into "slices" (e.g. bytes or dwords), and performing an analysis of where + * these values come from, propagating through things like {insert,extract}{value,element}. + * Using single-byte slices results in a potentially more accurate analysis, but has higher runtime cost. + * For every value, the analysis works on the in-memory layout of its type, including padding, even though we analyze + * only SSA values that might end up in registers. + * It can be thought of as describing the memory obtained from storing a value to memory. 
+ * + * In that sense, it is similar to how SROA splits up allocas into ranges, and analyses ranges separately. + * However, we only track contents of SSA values, and do not propagate through memory, and thus generally + * SROA should have been run before to eliminate non-necessary memory operations. + * + * If the client code has extra information on the origin of some intermediate values that this analysis cannot reason + * about, e.g. calls to special functions, or special loads, then it can provide this information in terms of + * assumptions, which use the same format as the analysis result, mapping slices of a value to slices of other values or + * constants. When analyzing a value with an assumption on it, the algorithm then applies the analysis result for + * values referenced by assumptions, and propagates the result through following instructions. + * + * The analysis does not modify functions, however, as part of the analysis, additional constants may be created. + * + * The motivating application that we have implemented this for is propagating constant known arguments into the + * Traversal shader in continuations-based ray tracing: + * + * The Traversal shader is enqueued by potentially multiple call sites in RayGen (RGS), Closest-Hit (CHS) or Miss (MS) + * shaders. If all these call sites share some common constant arguments (e.g. on the ray payload), then we may + * want to propagate these constants into the Traversal shader to reduce register pressure. + * On these call sites, a simple analysis based on known constant values suffices. + * + * However, the Traversal shader is re-entrant, and may enqueue itself. Also, with Any-Hit (AHS) and/or Intersection + * (IS) shaders in the pipeline, these shaders are enqueued by Traversal, which in turn re-enqueue Traversal. 
+ * + * Thus, in order to prove that incoming arguments of the Traversal shader are known constants, we need to prove + * that all TraceRay call sites share these constants, *and* that all functions that might re-enqueue Traversal + * (Traversal itself, AHS, IS) preserve these arguments, or set it to the same constant. + * + * This analysis allows all of the above: It allows to prove that certain outgoing arguments at TraceRay call sites + * have a specific constant value, and allow to prove that outgoing arguments of Traversal/AHS/IS preserve the + * corresponding incoming ones, or more precisely, that argument slots are preserved. + * Because we track on a fine granularity (e.g. dwords), we might be able to prove that parts of a struct argument are + * preserved even if some fields of it are changed. + * + *********************************************************************************************************************** + */ + +#pragma once + +#include +#include +#include + +namespace llvm { +class raw_ostream; +class Constant; +class DataLayout; +class Function; +class Instruction; +class Value; +} // namespace llvm + +namespace CompilerUtils { + +namespace ValueTracking { + +// enum wrapper with some convenience helpers for common operations. +// The contained value is a bitmask of status, and thus multiple status can be set. +// In that case we know that at run time, one of the status holds, but we don't know which one. +// This can occur with phi nodes and select instructions. +// In the common cases, just a single bit is set though. +struct SliceStatus { + // As the actual enum is contained within the struct, its values don't leak into the containing namespace, + // and it's not possible to implicitly cast a SliceStatus to an int, so it's as good as an enum class. 
+ enum StatusEnum : uint32_t { Constant = 0x1, Dynamic = 0x2, UndefOrPoison = 0x4 }; + StatusEnum S = {}; + + SliceStatus(StatusEnum S) : S{S} {} + + static SliceStatus makeEmpty() { return static_cast<StatusEnum>(0); } + + // Returns whether all status bits set in other are also set in us. + bool contains(SliceStatus Other) const { return (*this & Other) == Other; } + + // Returns whether no status bits are set. + bool isEmpty() const { return static_cast<uint32_t>(S) == 0; } + + // Returns whether there is exactly one status bit set. Returns false for an empty status. + bool isSingleStatus() const { + auto AsInt = static_cast<uint32_t>(S); + return (AsInt != 0) && (((AsInt - 1) & AsInt) == 0); + } + + SliceStatus operator&(SliceStatus Other) const { return static_cast<StatusEnum>(S & Other.S); } + + SliceStatus operator|(SliceStatus Other) const { return static_cast<StatusEnum>(S | Other.S); } + + bool operator==(SliceStatus Other) const { return S == Other.S; } + bool operator!=(SliceStatus Other) const { return !(S == Other.S); } +}; + +static constexpr unsigned MaxSliceSize = 4; // Needed for SliceInfo::ConstantValue + +// A slice consists of a consecutive sequence of bytes within the representation of a value. +// We keep track of a potential constant value, and a potential dynamic value that determines +// the byte representation of our slice. +// If both dynamic and constant values are set, then one of them determines the byte representation +// of our slice, but we don't know which. +// If just a single value is set, then we know that that one determines us. +// +// Allowing both a dynamic and a constant value is intended to allow patterns where a value +// is either a constant, or a passed-through argument. If the constant matches the values used +// to initialize the incoming argument on the caller side, then we can still prove that the value +// is in fact constant. +// +// If the bit width of a value is not a multiple of the slice size, the last slice contains +// unspecified high bits.
These are not guaranteed to be zeroed out. +struct SliceInfo { + SliceInfo(SliceStatus S) : Status{S} {} + void print(llvm::raw_ostream &OS, bool Compact = false) const; + + // Enum-bitmask of possible status of the value. + SliceStatus Status = SliceStatus::makeEmpty(); + uint32_t ConstantValue = 0; + static_assert(sizeof(ConstantValue) >= MaxSliceSize); + // If set, the byte representation of this slice is obtained + // from the given value at the given offset. + llvm::Value *DynamicValue = nullptr; + unsigned DynamicValueByteOffset = 0; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SliceInfo &BI); + +// Combines slice infos for a whole value, unless the value is too large, in which case it might be cut off. +// It is up to client code to detect missing slice infos at the value tail if that is relevant, +// e.g. in order to prove that all bytes in a value match some assumption. +struct ValueInfo { + void print(llvm::raw_ostream &OS, bool Compact = false) const; + + // Infos for the byte-wise representation of a value, partitioned into consecutive slices + llvm::SmallVector Slices; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const ValueInfo &VI); + +} // namespace ValueTracking + +// Utility class to track the origin of values, partitioned into slices of e.g. 1 or 4 bytes each. +// See the documentation at the top of this file for details. +// +// The status of each slice is given by its SliceStatus. +// If the size of a value exceeds MaxBytesPerValue, then only a prefix of that size is analyzed. +// This ensures bounded runtime and memory consumption on pathological cases with huge values. +// +// This is intended to be used for interprocedural optimizations, detecting cases where arguments are initialized with a +// constant and then always propagated, allowing to replace the argument by the initial constant. 
+class ValueOriginTracker { +public: + using ValueInfo = ValueTracking::ValueInfo; + // In some cases, client code has additional information on where values originate from, or + // where they should be assumed to originate from just for the purpose of the analysis. + // For instance, if a value is spilled and then re-loaded, value origin tracking + // would consider the reloaded value as unknown dynamic, because it doesn't track memory. + // Value origin assumptions allow the client to provide such extra information. + // For each registered value, when the analysis reaches the given value, it will instead rely on the supplied + // ValueInfo, and replace dynamic references by the analysis result for these dynamic values. + // This means that when querying values for which assumptions were given, it is *not* ensured that + // the exact assumptions are returned. + // + // Consider this example using dword slices: + // %equals.3 = add i32 3, 0 + // %unknown = call i32 @opaque() + // %arr.0 = insertvalue [3 x i32] poison, i32 %equals.3, 0 + // %arr.1 = insertvalue [3 x i32] %arr.0, i32 %unknown, 1 + // %arr.stored = insertvalue [3 x i32] %arr.1, i32 %unknown, 2 + // store [3 x i32] %arr.stored, ptr %ptr + // %reloaded = load [3 x i32], ptr %ptr + // We supply the assumption that the first two dwords of %reloaded are in fact the first two dwords of + // %arr.stored, and that the third dword equals 7 (because we have some additional knowledge somehow). + // Then, when querying %reloaded, the result will be: + // * dword 0: constant: 0x3 (result of the add) + // * dword 1: dynamic: %unknown (offset 0) + // * dword 2: constant: 0x7 + // + // If only some slices are known, the other slices can use the fallback of pointing to the value itself. + // For values with assumptions, we skip the analysis we'd perform otherwise, so adding assumptions can + // lead to worse analysis results on values that can be analyzed.
For now, this feature however + // is intended for values that are otherwise opaque. Support for merging with the standard analysis could be added. + // + // For now, only assumptions on instructions are supported. + // The intended uses of this feature only require it for instructions, and support for non-instructions + // is a bit more complicated but can be added if necessary. + // Also, only a single status on assumptions is allowed. + using ValueOriginAssumptions = llvm::DenseMap; + + ValueOriginTracker(const llvm::DataLayout &DL, unsigned BytesPerSlice = 4, unsigned MaxBytesPerValue = 512, + ValueOriginAssumptions OriginAssumptions = ValueOriginAssumptions{}) + : DL{DL}, BytesPerSlice{BytesPerSlice}, MaxBytesPerValue{MaxBytesPerValue}, + OriginAssumptions(std::move(OriginAssumptions)) {} + + // Computes a value info for the given value. + // If the value has been seen before, returns a cache hit from the ValueInfos map. + // When querying multiple values within the same functions, it is more efficient + // to first run analyzeValues() on all of them together. + ValueInfo getValueInfo(llvm::Value *V); + + // Analyze a set of values in bulk for efficiency. + // Value analysis needs to process whole functions, so analysing multiple values within the same + // function allows to use a single pass for them all. + // The passed values don't have to be instructions, and don't have to be in the same functions, + // although there is no perf benefit in that case. + // Values may contain duplicates. + void analyzeValues(llvm::ArrayRef Values); + +private: + struct ValueInfoBuilder; + const llvm::DataLayout &DL; + unsigned BytesPerSlice = 0; + unsigned MaxBytesPerValue = 0; + ValueOriginAssumptions OriginAssumptions; + llvm::DenseMap ValueInfos; + + // Analyze a value, creating a ValueInfo for it. + // If V is an instruction, this assumes the ValueInfos of dependencies have + // already been created. 
If some miss, we assume cyclic dependencies and give up + // on this value. + ValueInfo computeValueInfo(llvm::Value *V); + // Same as above, implementing constant analysis + ValueInfo computeConstantValueInfo(ValueInfoBuilder &VIB, llvm::Constant *C); + // Given an origin assumption, compute a value info that combines analysis results + // of the values referenced by the assumption. + ValueInfo computeValueInfoFromAssumption(ValueInfoBuilder &VIB, const ValueInfo &OriginAssumption); + + // Implementation function for analyzeValues(): + // Ensures that the ValueInfos map contains an entry for V, by optionally computing a value info first. + // Then, return a reference to the value info object within the map. + // The resulting reference is invalidated if ValueInfos is mutated. + // Assumes that all values this depends on have already been analyzed, except for phi nodes, + // which are handled pessimistically in case of loops. + ValueInfo &getOrComputeValueInfo(llvm::Value *V, bool KnownToBeNew = false); +}; + +} // namespace CompilerUtils diff --git a/compilerutils/include/compilerutils/ValueSpecialization.h b/compilerutils/include/compilerutils/ValueSpecialization.h new file mode 100644 index 0000000000..e1ca13c231 --- /dev/null +++ b/compilerutils/include/compilerutils/ValueSpecialization.h @@ -0,0 +1,176 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file ValueSpecialization.h + * @brief Helpers for changing the dword-wise representation of a value. + * + * @details + * Utility to replace dwords in the byte-wise representation of generic values by known constants or frozen poison. + * + * This is equivalent to storing a value to an alloca, then replacing some dwords, and then reading the value + * back, but does so without introducing an alloca, and instead directly working on the SSA value using + * {insert,extract}{value,element} instructions, and bit-wise ops for 64-bit values. + * + * Replacements are not guaranteed to succeed in the general case. 
Unsupported cases include: + * * dwords covering scalars smaller than a dword (e.g. i16) + * * dwords covering non-dword-aligned scalars + * + * Thus, this helper is intended for cases where we do not rely on the replacement for functional correctness, + * but instead apply it as an optimization, e.g. for constant propagation, and prefer to do that + * without introducing an alloca. This application motivates the name: The value is specialized for known + * constant contents when used in a particular context. + * + * If needed, the mechanism could be extended to allow replacement of dwords by dynamic values. + * + *********************************************************************************************************************** + */ + +#pragma once + +#include +#include +#include + +namespace llvm { +class DataLayout; +class Value; +class Module; +class StringRef; +} // namespace llvm + +namespace CompilerUtils { + +class ValueSpecializer { +public: + enum class SpecializationKind { + None = 0, // Keep the dword in the value as-is. + Constant, // Replace by a constant. + FrozenPoison, // Replace by a frozen poison value. We specialize with frozen poisons to prevent propagation + // of poison into the containing value. For instance, ORing a zext'ed non-frozen i32 poison into an + // i64 poisons the whole i64. + Count + }; + + struct DwordSpecializationInfo { + SpecializationKind Kind = SpecializationKind::None; + uint32_t ConstantValue = 0; + }; + + // An instance of this class can be re-used for multiple replacements on multiple values. + // This allows to re-use the builder insertion point, which can lead to nicer (e.g. for tests) IR. + ValueSpecializer(llvm::Module &M); + + // The IR builder stores a reference to us, so forbid copy and move. 
+ ValueSpecializer(const ValueSpecializer &) = delete; + ValueSpecializer(ValueSpecializer &&) = delete; + ValueSpecializer &operator=(const ValueSpecializer &) = delete; + ValueSpecializer &operator=(ValueSpecializer &&) = delete; + + // Replaces dwords in Val according to DwordInfos, and returns the result. + // Returns nullptr on failure, or if nothing was changed. + // + // Val needs to be an instruction or an argument (so we have a function to put new instructions in). + // For arguments, new instructions for specialization are added to the function entry block. + // For instructions, new instructions are added immediately after the specialized instruction. + // + // If ReplaceUses is set, then all uses of Val are replaced with the result, excluding new instructions that + // are added as part of the replacement. + // + // If PreservePreviousInsertionPoint is set, and this is not the first call of this function, + // we preserve the builder insertion point. In that case, it is the caller's responsibility to ensure that + // the definition of Val dominates the current insertion point. + // If the insertion point is reset, it is set to immediately after the replaced instruction, or after the last + // alloca instruction in the function's entry block for arguments. + // During the replacement, we do not change the insertion point, and just add instructions. + // Thus, it is e.g. safe to preserve the insertion point when only specializing function arguments. + // + // Replacement values of the same type as Val reuse Val's name, plus NameSuffix. + // Temporaries of nested types are not given names.
+ struct ReplacementResult { + llvm::Value *Replacement; // nullptr if no replacement was done + unsigned NumReplacedDwords; + }; + ReplacementResult replaceDwords(llvm::Value *Val, llvm::ArrayRef DwordInfos, + bool ReplaceUses, bool PreservePreviousInsertionPoint, + llvm::StringRef NameSuffix = ".specialized"); + +private: + // We use a callback to keep track of new instructions, which need to be skipped in the final RAUW. + llvm::IRBuilder B; + const llvm::DataLayout &DL; + llvm::Type *I32 = nullptr; + llvm::Type *I64 = nullptr; + bool IsFirstCall = true; + + // Per-run data: + unsigned NumReplacedDwords = 0; + llvm::SmallDenseSet NewInsts; + + llvm::Value *getFrozenPoison(llvm::Type *Ty) { return B.CreateFreeze(llvm::PoisonValue::get(Ty)); } + llvm::Value *getI32Constant(uint32_t ConstantValue) { return llvm::ConstantInt::get(I32, ConstantValue); } + llvm::Value *getI64Constant(uint64_t ConstantValue) { return llvm::ConstantInt::get(I64, ConstantValue); } + + // Replace dwords in Val according to DwordInfos, and return the result. + // Val may be nullptr if all dwords in DwordInfos are specialized, + // meaning the result does not depend on the initial value. + llvm::Value *replaceDwordsInNonAggregate(llvm::Type *Ty, llvm::Value *Val, + llvm::ArrayRef DwordInfos, + llvm::StringRef ReplacementName); + + // Replaces dwords in RootVal according to DwordInfos. Handles both aggregate as well as non-aggregate types. + // Returns the modified value, and nullptr upon failure, or if nothing was changed. + // + // * RootVal: The value we want to replace dwords to constants in. + // * Indices: If RootVal is an aggregate, these indices point to a nested value in RootVal + // that this recursive function call should handle. In that case, + // CurTy and DwordInfos refer to that nested value. + // * CurTy: Type of the (possibly nested) value within RootVal to change. + // * DwordInfos: Dword-wise infos on what to change. 
+ // + // For aggregate types, it recurses into each element, using the same root value, + // populating Indices and CurTy, and restricting DwordInfos to the sub-range according to the element. + // Once we reach a non-aggregate type, we extractvalue that element, apply the non-aggregate replacement, + // and insertvalue the result. + // In case the whole element is replaced, we skip the extractvalue and start with a frozen poison value instead if + // necessary. + // + // The goal is to emit insertvalue instructions that directly insert into the leaf level, + // instead of first extracting a nested (possibly aggregate!) value, then extracting nested values, + // then specializing the nested value, inserting the nested value into the element value, and then + // inserting the element value into the struct. + // For example, when specializing dword 1 to 17 in { { i32, i32 }, i32 } %foo, we want to emit + // %foo.specialized = insertvalue { { i32, i32 }, i32 } %foo, i32 17, 0, 1 + // instead of the naive + // %nested = extractvalue { { i32, i32 }, i32 } %foo, 0 + // %nested.specialized = insertvalue { i32, i32 } %nested, i32 17, 1 + // %foo.specialized = insertvalue { { i32, i32 }, i32 } %foo, { i32, i32 } %nested.specialized, 0 + // + // For non-aggregates, this is just a wrapper around replaceDwordsInNonAggregate. 
+ llvm::Value *replaceDwordsImpl(llvm::Value *RootVal, llvm::SmallVectorImpl &Indices, llvm::Type *CurTy, + llvm::ArrayRef DwordInfos, llvm::StringRef ReplacementName); +}; + +} // namespace CompilerUtils diff --git a/compilerutils/lib/CompilerUtils.cpp b/compilerutils/lib/CompilerUtils.cpp index e454db298a..80ba2b0d2b 100644 --- a/compilerutils/lib/CompilerUtils.cpp +++ b/compilerutils/lib/CompilerUtils.cpp @@ -24,6 +24,8 @@ **********************************************************************************************************************/ #include "compilerutils/CompilerUtils.h" +#include "ValueOriginTrackingTestPass.h" +#include "ValueSpecializationTestPass.h" #include "compilerutils/DxilToLlvm.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/iterator_range.h" @@ -162,17 +164,6 @@ void CompilerUtils::setIsLastUseLoad(llvm::LoadInst &Load) { namespace { -// Get the name of a global that is copied to a different module for inlining. -std::string getCrossModuleName(GlobalValue &gv) { - if (auto *fn = dyn_cast(&gv)) { - // Intrinsics should not be renamed since the IR verifier insists on a "correct" name mangling based on any - // overloaded types. Lgc dialects also require exact name for similar reason. - if (fn->isIntrinsic() || fn->getName().starts_with("lgc.")) - return fn->getName().str(); - } - return (Twine(gv.getName()) + ".cloned." 
+ gv.getParent()->getName()).str(); -} - class CrossModuleValueMaterializer : public ValueMaterializer { public: CrossModuleValueMaterializer(Module *targetMod, CompilerUtils::CrossModuleInliner &inliner, @@ -198,7 +189,7 @@ class CrossModuleValueMaterializer : public ValueMaterializer { if (auto *existing = inliner->findCopiedGlobal(*gv, *targetMod)) return existing; - auto newName = getCrossModuleName(*gv); + auto newName = CompilerUtils::CrossModuleInliner::getCrossModuleName(*gv); if (auto *callee = dyn_cast(gv)) { if (!callee->isDeclaration()) { report_fatal_error( @@ -386,6 +377,17 @@ GlobalValue *CompilerUtils::CrossModuleInliner::findCopiedGlobal(GlobalValue &so return gv; } +// Get the name of a global that is copied to a different module for inlining. +std::string CompilerUtils::CrossModuleInliner::getCrossModuleName(GlobalValue &gv) { + if (auto *fn = dyn_cast(&gv)) { + // Intrinsics should not be renamed since the IR verifier insists on a "correct" name mangling based on any + // overloaded types. Lgc dialects also require exact name for similar reason. + if (fn->isIntrinsic() || fn->getName().starts_with("lgc.")) + return fn->getName().str(); + } + return (Twine(gv.getName()) + ".cloned." 
+ gv.getParent()->getName()).str(); +} + PointerType *llvm::getWithSamePointeeType(PointerType *ptrTy, unsigned addressSpace) { #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 482880 return PointerType::getWithSamePointeeType(ptrTy, addressSpace); diff --git a/compilerutils/lib/DxilToLlvm.cpp b/compilerutils/lib/DxilToLlvm.cpp index 47afe786ab..0ee0c77d62 100644 --- a/compilerutils/lib/DxilToLlvm.cpp +++ b/compilerutils/lib/DxilToLlvm.cpp @@ -283,6 +283,26 @@ struct DxilToLlvmPassImpl { m_typeLower.eraseInstruction(&extractElement); } + void visitShuffleVector(llvm::ShuffleVectorInst &shuffleVector) { + Value *inputVector0 = shuffleVector.getOperand(0); + Value *inputVector1 = shuffleVector.getOperand(1); + ArrayRef shuffleMask = shuffleVector.getShuffleMask(); + Type *elementTy = cast(inputVector0->getType())->getElementType(); + assert(cast(inputVector1->getType())->getElementType() == elementTy); + + if (convertVectorElementType(elementTy) == nullptr) + return; + + Value *convertedInputVector0 = getConvertedValue(inputVector0); + Value *convertedInputVector1 = getConvertedValue(inputVector1); + + IRBuilder<> builder(&shuffleVector); + auto *replacement = + builder.CreateShuffleVector(convertedInputVector0, convertedInputVector1, shuffleMask, shuffleVector.getName()); + + m_typeLower.replaceInstruction(&shuffleVector, replacement); + } + void visitGEP(llvm::GetElementPtrInst &gepInst) { Type *oldTy = gepInst.getSourceElementType(); Type *newTy = getConvertedType(oldTy); @@ -322,6 +342,7 @@ struct DxilToLlvmPassImpl { .nest(&TypeLowering::registerVisitors) .add(&DxilToLlvmPassImpl::visitInsertElement) .add(&DxilToLlvmPassImpl::visitExtractElement) + .add(&DxilToLlvmPassImpl::visitShuffleVector) .add(&DxilToLlvmPassImpl::visitGEP) .build(); fixFunctionTypes(); diff --git a/compilerutils/lib/PassRegistry.inc b/compilerutils/lib/PassRegistry.inc index 13b599016a..385580b1d6 100644 --- a/compilerutils/lib/PassRegistry.inc +++ b/compilerutils/lib/PassRegistry.inc @@ 
-38,6 +38,8 @@ #endif COMPILERUTILS_MODULE_PASS("dxil-to-llvm", DxilToLlvmPass()) +COMPILERUTILS_MODULE_PASS("value-origin-tracking-test", ValueOriginTrackingTestPass()) +COMPILERUTILS_MODULE_PASS("value-specialization-test", ValueSpecializationTestPass()) #undef COMPILERUTILS_PASS #undef COMPILERUTILS_MODULE_PASS diff --git a/compilerutils/lib/TypesMetadata.cpp b/compilerutils/lib/TypesMetadata.cpp index 23f0e01004..c69790fe3a 100644 --- a/compilerutils/lib/TypesMetadata.cpp +++ b/compilerutils/lib/TypesMetadata.cpp @@ -101,7 +101,7 @@ TypedFuncTy TypedFuncTy::get(const Function *F) { // Construct a TypedFuncTy for the given result type and arg types. // This constructs the !pointeetys metadata; that can then be attached to a function // using writeMetadata(). -TypedFuncTy::TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys) { +TypedFuncTy::TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys, bool IsVarArg) { SmallVector BareArgTys; SmallVector PointeeTys; unsigned SimpleFormatArgIdx = UINT_MAX; @@ -133,7 +133,7 @@ TypedFuncTy::TypedFuncTy(TypedArgTy ResultTy, ArrayRef ArgTys) { } } } - FuncTy = FunctionType::get(ResultTy.asType(), BareArgTys, /*isVarArg=*/false); + FuncTy = FunctionType::get(ResultTy.asType(), BareArgTys, IsVarArg); if (!PointeeTys.empty()) Meta = MDTuple::get(FuncTy->getContext(), PointeeTys); } @@ -239,6 +239,6 @@ void llvm::DXILValueTypeMetadataCallback(Value *V, unsigned TypeID, GetTypeByIDT else ArgTys.push_back(ArgTy); } - TypedFuncTy(ReturnTy, ArgTys).writeMetadata(cast(V)); + TypedFuncTy(ReturnTy, ArgTys, FuncTy->isVarArg()).writeMetadata(cast(V)); } } diff --git a/compilerutils/lib/ValueOriginTracking.cpp b/compilerutils/lib/ValueOriginTracking.cpp new file mode 100644 index 0000000000..a5f57e7d1f --- /dev/null +++ b/compilerutils/lib/ValueOriginTracking.cpp @@ -0,0 +1,826 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro 
Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#include "compilerutils/ValueOriginTracking.h" +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_TYPE "value-origin-tracking" + +using namespace CompilerUtils; +using namespace CompilerUtils::ValueTracking; +using namespace llvm; + +namespace CompilerUtils { + +namespace { + +// Given indices into an aggregate type used in {extract,insert}value instructions, +// compute the byte offset of the value indexed by the indices. 
+static unsigned computeByteOffsetInAggregate(Type *AggTy, ArrayRef Indices, const DataLayout &DL) { + Type *I32 = IntegerType::getInt32Ty(AggTy->getContext()); + // Compute the byte offset of the extracted value by essentially interpreting the indices as GEP indices + // TODO: Can we do this without the GEP hack, and without re-implementing aggregate bit layouts? + SmallVector GEPIndices; + GEPIndices.reserve(Indices.size()); + GEPIndices.push_back(ConstantInt::getSigned(I32, 0)); + for (auto Idx : Indices) + GEPIndices.push_back(ConstantInt::getSigned(I32, Idx)); + + APInt APOffset{32, 0}; + [[maybe_unused]] bool Success = GEPOperator::accumulateConstantOffset(AggTy, GEPIndices, DL, APOffset); + // This should always succeed with constant indices + assert(Success); + + unsigned Offset = APOffset.getZExtValue(); + return Offset; +} + +static std::optional computeByteOffsetInVector(Type *VecTy, Value *IndexArg, const DataLayout &DL) { + auto *ConstantIndex = dyn_cast(IndexArg); + if (!ConstantIndex) + return std::nullopt; + + Type *ElemTy = cast(VecTy)->getElementType(); + unsigned BitWidth = DL.getTypeSizeInBits(ElemTy); + if (BitWidth % 8) + return std::nullopt; + + unsigned Index = ConstantIndex->getZExtValue(); + return Index * (BitWidth / 8); +} + +// Combine slice infos for a select or phi instruction, so we know that our slice equals +// one of the given slices, but we don't know which. 
+std::optional static combineSliceInfosForSelect(ArrayRef Slices) { + if (Slices.empty()) + return std::nullopt; + if (Slices.size() == 1) + return *Slices[0]; + + SliceInfo Result{SliceStatus::makeEmpty()}; + auto AddResultStatusBit = [&Result](SliceStatus StatusBit) { + assert(StatusBit.isSingleStatus()); + Result.Status = (Result.Status | StatusBit); + }; + + // Set constant if there is a consistent one + { + std::optional OptConstantValue; + for (const SliceInfo *Slice : Slices) { + if (Slice->Status.contains(SliceStatus::Constant)) { + if (!OptConstantValue.has_value()) { + // we are the first to require a specific constant. + OptConstantValue = Slice->ConstantValue; + } else { + // there already is a value. check for consistency. + if (OptConstantValue.value() != Slice->ConstantValue) { + // conflict. + return std::nullopt; + } + } + } + } + if (OptConstantValue.has_value()) { + AddResultStatusBit(SliceStatus::Constant); + Result.ConstantValue = OptConstantValue.value(); + } + } + + // Set dynamic info if there is a consistent one + { + struct DynInfo { + Value *V; + unsigned Offset; + }; + std::optional OptDynInfo; + for (const SliceInfo *Slice : Slices) { + if (Slice->Status.contains(SliceStatus::Dynamic)) { + DynInfo CurDynInfo = {Slice->DynamicValue, Slice->DynamicValueByteOffset}; + if (!OptDynInfo.has_value()) { + // we are the first to require a specific constant. + OptDynInfo = CurDynInfo; + } else { + // there already is a value. check for consistency. + if (OptDynInfo.value().V != CurDynInfo.V || OptDynInfo->Offset != CurDynInfo.Offset) { + // conflict. 
+ return std::nullopt; + } + } + } + } + if (OptDynInfo.has_value()) { + AddResultStatusBit(SliceStatus::Dynamic); + Result.DynamicValue = OptDynInfo->V; + Result.DynamicValueByteOffset = OptDynInfo->Offset; + } + } + + // Check for UndefOrPoison + if (std::any_of(Slices.begin(), Slices.end(), + [](const SliceInfo *Slice) { return Slice->Status.contains(SliceStatus::UndefOrPoison); })) + AddResultStatusBit(SliceStatus::UndefOrPoison); + return Result; +} + +} // namespace + +// Helper class to create ValueInfos +struct ValueOriginTracker::ValueInfoBuilder { + ValueInfoBuilder(const DataLayout &DL, Value *V, unsigned BytesPerSlice, unsigned MaxBytesPerValue) + : V{V}, BytesPerSlice{BytesPerSlice}, MaxBytesPerValue{MaxBytesPerValue}, + NumBits{static_cast(DL.getTypeSizeInBits(V->getType()).getFixedValue())}, + NumBytes{divideCeil(NumBits, 8)}, NumSlices{ + llvm::divideCeil(std::min(NumBytes, MaxBytesPerValue), BytesPerSlice)} {} + + Value *V = nullptr; // The value for which we are building a ValueInfo + unsigned BytesPerSlice = 0; + unsigned MaxBytesPerValue; + unsigned NumBits = 0; + unsigned NumBytes = 0; + unsigned NumSlices = 0; + + // In cases where we can't reason about a slice, we use a dynamic self-referencing slice. + SliceInfo getDynamicSlice(unsigned SliceIdx) const { + SliceInfo SI{SliceStatus::Dynamic}; + SI.DynamicValue = V; + SI.DynamicValueByteOffset = BytesPerSlice * SliceIdx; + return SI; + } + + ValueInfo createUndef() const { + SliceInfo SI{SliceStatus::UndefOrPoison}; + ValueInfo Result{}; + Result.Slices.resize(NumSlices, SI); + return Result; + } + + // Creates a value info for a value that has the given constant on every slice. 
+ ValueInfo createUniformConstant(uint32_t UniformConstantValue) const { + SliceInfo SI{SliceStatus::Constant}; + SI.ConstantValue = UniformConstantValue; + ValueInfo Result{}; + Result.Slices.reserve(NumSlices); + + unsigned BitsPerSlice = 8 * BytesPerSlice; + unsigned NumRemainingBits = NumBits; + + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + if (NumRemainingBits < BitsPerSlice) { + // For the last slice, zero out the upper dead bits. This isn't required by the interface, + // but is simple and leads to nicer tests. + assert(SliceIdx + 1 == NumSlices); + SI.ConstantValue &= (~0u >> (BitsPerSlice - NumRemainingBits)); + Result.Slices.push_back(SI); + break; + } + Result.Slices.push_back(SI); + NumRemainingBits -= BitsPerSlice; + } + return Result; + } + + // Given KnownBits about the value, return a value info that uses constant slices where possible, + // and fall back to dynamic slices if necessary. + // This may be required for slices where not all bits are known. 
+ ValueInfo createConstant(const KnownBits &KB) const { + assert(KB.One.getBitWidth() == NumBits); + ValueInfo Result{}; + Result.Slices.reserve(NumSlices); + unsigned BitsPerSlice = 8 * BytesPerSlice; + unsigned SliceMask = ~0u >> (8 * (4 - BytesPerSlice)); + unsigned NumRemainingBits = NumBits; + auto GetSliceFromAPInt = [&](const APInt &AI, unsigned SliceIdx) -> uint32_t { + assert(BytesPerSlice <= 4); + unsigned DWIdx = (BytesPerSlice * SliceIdx) / 4; + unsigned ByteOffsetInDW = (BytesPerSlice * SliceIdx) % 4; + unsigned QWIdx = DWIdx / 2; + assert(QWIdx < AI.getNumWords()); + auto QW = AI.getRawData()[QWIdx]; + if (DWIdx % 2) { + QW >>= 32; + } + QW >>= (8 * ByteOffsetInDW); + return QW & SliceMask; + }; + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + auto One = GetSliceFromAPInt(KB.One, SliceIdx); + auto Zero = GetSliceFromAPInt(KB.Zero, SliceIdx); + if (NumRemainingBits < BitsPerSlice) { + // For the last slice, accept a partial known mask, because the tail bits are dead + // and not analyzed by KnownBits + SliceMask >>= (BitsPerSlice - NumRemainingBits); + assert(SliceIdx + 1 == NumSlices); + } + if ((One | Zero) == SliceMask) { + SliceInfo SI{SliceStatus::Constant}; + SI.ConstantValue = One; + Result.Slices.push_back(SI); + } else { + // There are unknown bits. Give up on this slice. + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + } + NumRemainingBits -= BitsPerSlice; + } + return Result; + } + + // Return a value info that just refers to the value itself on every slice. This can always be used as fallback. + ValueInfo createDynamic() const { + ValueInfo Result{}; + Result.Slices.reserve(NumSlices); + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + + return Result; + } + + // Obtain the value info for a sub-range of slices. 
+ ValueInfo createExtraction(const ValueInfo &AggInfo, unsigned ByteOffset) const { + // Note that NumBytes might not be a multiple of slices, and thus + // the last slice of Result might cover data outside of our value. + // But that should be fine, we might just be a bit pessimistic. + if (ByteOffset % BytesPerSlice) { + LLVM_DEBUG(dbgs() << "Non-aligned extract " << *V << ", giving up.\n"); + return createDynamic(); + } + ValueInfo Result{}; + unsigned BeginSlice = ByteOffset / BytesPerSlice; + unsigned ResultNumSlices = NumSlices; + if (BeginSlice < AggInfo.Slices.size()) { + ResultNumSlices = std::min(NumSlices, static_cast(AggInfo.Slices.size() - BeginSlice)); + Result.Slices.append(AggInfo.Slices.begin() + BeginSlice, AggInfo.Slices.begin() + BeginSlice + NumSlices); + } + // Fill up with fallback if necessary + for (unsigned SliceIdx = Result.Slices.size(); SliceIdx < ResultNumSlices; ++SliceIdx) + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + + assert(Result.Slices.size() == ResultNumSlices && ResultNumSlices <= NumSlices); + + return Result; + } + + // Computes a ValueInfo that is obtained by inserting a value at the given byte offset and size + // into this value, e.g. in insert{value, element}. + ValueInfo createInsertion(const ValueInfo &Agg, const ValueInfo &Inserted, unsigned ByteOffset, + unsigned InsertedByteCount) const { + ValueInfo Result = Agg; + unsigned SliceBegin = ByteOffset / BytesPerSlice; + unsigned SliceEnd = + std::min(divideCeil(ByteOffset + InsertedByteCount, BytesPerSlice), Result.Slices.size()); + if (ByteOffset % BytesPerSlice) { + LLVM_DEBUG(dbgs() << "Insertion with non-aligned offset: " << *V << "\n"); + // We don't support merging misaligned slices. Use the fallback for all affected slices. 
+ for (unsigned SliceIdx = SliceBegin; SliceIdx < SliceEnd; ++SliceIdx) + Result.Slices[SliceIdx] = getDynamicSlice(SliceIdx); + + assert(Result.Slices.size() == NumSlices); + return Result; + } + for (unsigned SliceIdx = SliceBegin; SliceIdx < SliceEnd; ++SliceIdx) { + unsigned OtherSliceIdx = SliceIdx - SliceBegin; + if (OtherSliceIdx < Inserted.Slices.size()) + Result.Slices[SliceIdx] = Inserted.Slices[OtherSliceIdx]; + else + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + } + if (InsertedByteCount % BytesPerSlice && SliceBegin < SliceEnd) { + LLVM_DEBUG(dbgs() << "Insertion with non-aligned size " << *V << "\n"); + // The last slice is only partially replaced. + // We don't yet support merging partial slices + Result.Slices[SliceEnd - 1] = getDynamicSlice(SliceEnd - 1); + } + + assert(Result.Slices.size() == NumSlices); + return Result; + } + + // Create a value info for a value that is obtained by selecting one of the given values, + // e.g. in a phi or select instruction. + ValueInfo createSelect(ArrayRef ValueInfos) { + if (ValueInfos.empty()) + return createDynamic(); + if (ValueInfos.size() == 1) + return *ValueInfos[0]; + SmallVector SliceInfos; + SliceInfos.reserve(ValueInfos.size()); + bool Stop = false; + ValueInfo Result; + Result.Slices.reserve(ValueInfos[0]->Slices.size()); + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + SliceInfos.clear(); + for (const ValueInfo *ValueInfo : ValueInfos) { + if (SliceIdx < ValueInfo->Slices.size()) { + SliceInfos.push_back(&ValueInfo->Slices[SliceIdx]); + } else { + // Give up on this and higher slices + Stop = true; + break; + } + } + if (Stop) + break; + std::optional OptSliceInfo = combineSliceInfosForSelect(SliceInfos); + if (OptSliceInfo.has_value()) { + // We succeeded in combining the slices + Result.Slices.push_back(OptSliceInfo.value()); + } else { + // Create dynamic slice + SliceInfo SI{SliceStatus::Dynamic}; + SI.DynamicValue = V; + SI.DynamicValueByteOffset = BytesPerSlice * 
SliceIdx; + Result.Slices.push_back(SI); + } + } + return Result; + } + + // For each slice, the assumption either gives us constant/undef values, or references + // other dynamic values. ReferencedInfos is indexed by slices and gives value infos for these + // referenced dynamic values. + // This function then combines all these infos accordingly. + ValueInfo createFromAssumption(const ValueInfo &Assumption, ArrayRef ReferencedInfos) { + ValueInfo Result; + assert(Assumption.Slices.size() == ReferencedInfos.size()); + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + if (SliceIdx >= Assumption.Slices.size()) { + // If slices are missing in the assumption, use the dynamic fallback + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + continue; + } + // Start with the assumption, then merge with the referenced info if applicable. + // For non-dynamic assumptions, we just use the assumption directly. + SliceInfo AssumptionSI = Assumption.Slices[SliceIdx]; + assert(AssumptionSI.Status.isSingleStatus()); + if (!AssumptionSI.Status.contains(SliceStatus::Dynamic)) { + Result.Slices.push_back(AssumptionSI); + continue; + } + // No multi-status assumptions are allowed, this would require merging constants here + assert(AssumptionSI.Status == SliceStatus::Dynamic); + const ValueInfo *ReferencedInfo = ReferencedInfos[SliceIdx]; + if (ReferencedInfo != nullptr) { + + if (AssumptionSI.DynamicValueByteOffset % BytesPerSlice) { + // Misaligned assumption, give up on this slice + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + continue; + } + unsigned ReferencedSliceIdx = AssumptionSI.DynamicValueByteOffset / BytesPerSlice; + if (ReferencedSliceIdx >= ReferencedInfo->Slices.size()) { + // No referenced slice available + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + continue; + } + // The assumption references an existing slice info. Use that one. 
+ Result.Slices.push_back(ReferencedInfo->Slices[ReferencedSliceIdx]); + } else { + // Missing reference infos are only allowed for self-references + assert(AssumptionSI.DynamicValue == V); + Result.Slices.push_back(getDynamicSlice(SliceIdx)); + } + } + assert(Result.Slices.size() == NumSlices); + return Result; + } +}; + +// Implement status printing also here, because for multi-bit status we want to interleave the printing +// with the referenced values. +void SliceInfo::print(llvm::raw_ostream &OS, bool Compact) const { + bool IsFirst = true; + auto Sep = Compact ? "|" : " | "; + if (!Status.isSingleStatus()) + OS << "("; + if (Status.contains(SliceStatus::UndefOrPoison)) { + if (!IsFirst) + OS << Sep; + IsFirst = false; + OS << (Compact ? "U" : "UndefOrPoison"); + } + if (Status.contains(SliceStatus::Constant)) { + if (!IsFirst) + OS << Sep; + IsFirst = false; + if (Compact) { + OS << "C"; + } else { + OS << "Constant: 0x"; + OS.write_hex(ConstantValue); + } + } + if (Status.contains(SliceStatus::Dynamic)) { + if (!IsFirst) + OS << Sep; + IsFirst = false; + bool IsArg = isa(DynamicValue); + if (Compact) { + OS << (IsArg ? "A" : "D"); + } else { + OS << "Dynamic" << (IsArg ? 
" (argument): " : ": ") << *DynamicValue << " (offset " << DynamicValueByteOffset + << ")"; + } + } + if (!Status.isSingleStatus()) + OS << ")"; +} + +llvm::raw_ostream &CompilerUtils::ValueTracking::operator<<(llvm::raw_ostream &OS, const SliceInfo &SI) { + SI.print(OS); + return OS; +} + +void ValueTracking::ValueInfo::print(llvm::raw_ostream &OS, bool Compact) const { + if (Compact) { + for (const auto &Slice : Slices) { + Slice.print(OS, true); + } + } else { + for (const auto &[Idx, Slice] : enumerate(Slices)) { + if (Idx) + OS << "; "; + OS << Slice; + } + } +} + +llvm::raw_ostream &CompilerUtils::ValueTracking::operator<<(llvm::raw_ostream &OS, const ValueInfo &VI) { + VI.print(OS); + return OS; +} + +ValueInfo ValueOriginTracker::computeConstantValueInfo(ValueInfoBuilder &VIB, llvm::Constant *CV) { + if (CV->isNullValue()) + return VIB.createUniformConstant(0); + + // Don't bother with globals we can't reason about + if (isa(CV) || isa(CV) || isa(CV->getType())) + return VIB.createDynamic(); + + auto Ty = CV->getType(); + unsigned BitsPerSlice = 8 * BytesPerSlice; + // Don't bother with dynamic vectors + auto *VectorTy = dyn_cast(Ty); + auto *ArrayTy = dyn_cast(Ty); + Type *ElemTy = nullptr; + unsigned NumElements = 0; + if (VectorTy) { + ElemTy = VectorTy->getElementType(); + NumElements = VectorTy->getNumElements(); + } else if (ArrayTy) { + ElemTy = ArrayTy->getElementType(); + NumElements = ArrayTy->getNumElements(); + } + + // For integer constants, FP constants, and vector-of-integer constants, use computeKnownBits. + // It does not support vector of FP, or arrays. + if (isa(CV) || isa(CV) || (VectorTy && ElemTy->isIntegerTy())) { + // computeKnownBits only supports integers and integer vector types. + // For vector types, it returns common known bits merged across all elements, as wide as single + // element, instead of known bits of the whole value. Thus, cast non-integers to integers first. 
+ Value *ToBeAnalyzed = CV; + if (!CV->getType()->isIntegerTy()) { + unsigned NumBits = DL.getTypeSizeInBits(CV->getType()); + llvm::Type *IntTy = IntegerType::get(CV->getContext(), NumBits); + ToBeAnalyzed = ConstantExpr::getBitCast(CV, IntTy); + } + auto KnownBits = computeKnownBits(ToBeAnalyzed, DL, 2); + return VIB.createConstant(KnownBits); + } + + // The remainder of this function deals with arrays and vectors only. + if (VectorTy == nullptr && ArrayTy == nullptr) + return VIB.createDynamic(); + + auto *ConstDataSeq = dyn_cast(CV); + auto *ConstArr = dyn_cast(CV); + auto *ConstVec = dyn_cast(CV); + assert(ConstDataSeq == nullptr || ConstDataSeq->getNumElements() == NumElements); + assert(ConstArr == nullptr || ConstArr->getNumOperands() == NumElements); + assert(ConstVec == nullptr || ConstVec->getNumOperands() == NumElements); + + if (ConstDataSeq != nullptr || ConstArr != nullptr || ConstVec != nullptr) { + // Array or vector. Try to concatenate the elements infos if possible. + // This is possible if element sizes are slice-aligned, and no padding needs to be considered. + // We could maybe extend the below to structs, but that's even more complicated because + // we need to account for padding on every element, and there can be nested structs, so ignore them for now. + unsigned BitsPerElement = ElemTy->getPrimitiveSizeInBits(); + unsigned AlignedBitsPerElement = VectorTy ? 
BitsPerElement : 8 * DL.getTypeAllocSize(ElemTy).getFixedValue(); + if (BitsPerElement != AlignedBitsPerElement || BitsPerElement % BitsPerSlice != 0) + return VIB.createDynamic(); + + // Handle constant vector of values whose sizes are integer-multiples of the slice size, + // so we can just concatenate slices element-wise + unsigned SlicesPerElement = BitsPerElement / BitsPerSlice; + ValueInfo Result; + Result.Slices.reserve(SlicesPerElement * NumElements); + for (unsigned ElemIdx = 0; ElemIdx < NumElements; ++ElemIdx) { + // Accessing the element as constant is slightly less efficient, but allows to use the + // computeKnownBits() machinery to obtain bit layouts of floats + llvm::Constant *ElemAsConstant = nullptr; + if (ConstDataSeq) { + ElemAsConstant = ConstDataSeq->getElementAsConstant(ElemIdx); + } else if (ConstArr) { + ElemAsConstant = ConstArr->getOperand(ElemIdx); + } else { + assert(ConstVec != nullptr); + ElemAsConstant = ConstVec->getOperand(ElemIdx); + } + const auto &ValueInfo = getOrComputeValueInfo(ElemAsConstant); + Result.Slices.append(ValueInfo.Slices); + } + return Result; + } + + return VIB.createDynamic(); +} + +ValueInfo ValueOriginTracker::computeValueInfoFromAssumption(ValueInfoBuilder &VIB, const ValueInfo &OriginAssumption) { + SmallVector ReferencedValueInfos; + ReferencedValueInfos.reserve(OriginAssumption.Slices.size()); + for (const auto &AssumptionSliceInfo : OriginAssumption.Slices) { + const ValueInfo *ReferencedValueInfo = nullptr; + if (AssumptionSliceInfo.DynamicValue) { + if (AssumptionSliceInfo.DynamicValue != VIB.V) { + auto ReferencedIt = ValueInfos.find(AssumptionSliceInfo.DynamicValue); + assert(ReferencedIt != ValueInfos.end()); + ReferencedValueInfo = &ReferencedIt->second; + } else { + // The assumption on this slice is trivial, referring to the value itself. 
+ // Leave the nullptr as-is, and handle it in createFromAssumption + } + } + ReferencedValueInfos.push_back(ReferencedValueInfo); + } + return VIB.createFromAssumption(OriginAssumption, ReferencedValueInfos); +} + +// Analyze a value, creating a ValueInfo for it. +// If V is an instruction, this asserts the ValueInfos of dependencies have already been created. +// An exception are PHI nodes: We only support propagation in a single pass, and thus handle loops conservatively, +// treating dependencies on earlier loop iterations as dynamic. Thus, for PHI nodes, if dependencies have not yet +// been analyzed, we assume loop dependencies and give up. +ValueInfo ValueOriginTracker::computeValueInfo(llvm::Value *V) { + ValueInfoBuilder VIB{DL, V, BytesPerSlice, MaxBytesPerValue}; + if (isa(V)) { + return VIB.createUndef(); + } + if (auto *CV = dyn_cast(V)) + return computeConstantValueInfo(VIB, CV); + + Instruction *Inst = dyn_cast(V); + if (!Inst) + return VIB.createDynamic(); + + auto OriginAssumptionIt = OriginAssumptions.find(Inst); + if (OriginAssumptionIt != OriginAssumptions.end()) { + // There is an origin assumption on this instruction. Collect and combine the value infos of referenced values. + // Note: This does not combine with an analysis of V that we would have done without an assumption. + // This can be pessimistic if there are assumptions on values we can analyze, but for now + // this suffices as we only plan to add assumptions on values that are otherwise completely opaque. 
+ return computeValueInfoFromAssumption(VIB, OriginAssumptionIt->second); + } + + switch (Inst->getOpcode()) { + case Instruction::AddrSpaceCast: + case Instruction::BitCast: + case Instruction::Freeze: { + // Just forward the operand for size-preserving type conversions and freeze + auto *Op = Inst->getOperand(0); + auto It = ValueInfos.find(Op); + assert(It != ValueInfos.end()); + return It->second; + } + case Instruction::ExtractElement: { + auto *EE = cast(Inst); + auto *Vec = EE->getVectorOperand(); + auto *IndexArg = EE->getIndexOperand(); + + std::optional Offset = computeByteOffsetInVector(Vec->getType(), IndexArg, DL); + if (!Offset.has_value()) + return VIB.createDynamic(); + + // Obtain ValueInfo for the source aggregate + auto It = ValueInfos.find(Vec); + assert(It != ValueInfos.end()); + const ValueInfo &SrcInfo = It->second; + + // Extract extracted slices + return VIB.createExtraction(SrcInfo, *Offset); + } + case Instruction::ExtractValue: { + auto *EV = cast(Inst); + auto *Src = EV->getAggregateOperand(); + + unsigned Offset = computeByteOffsetInAggregate(Src->getType(), EV->getIndices(), DL); + + // Obtain ValueInfo for the source aggregate + auto It = ValueInfos.find(Src); + assert(It != ValueInfos.end()); + const ValueInfo &SrcInfo = It->second; + + // Extract extracted slices + return VIB.createExtraction(SrcInfo, Offset); + } + case Instruction::InsertElement: { + // TODO: Support shufflevector + auto *IE = cast(Inst); + auto *Vec = IE->getOperand(0); + auto *Inserted = IE->getOperand(1); + auto *IndexArg = IE->getOperand(2); + + std::optional Offset = computeByteOffsetInVector(Vec->getType(), IndexArg, DL); + if (!Offset.has_value()) + return VIB.createDynamic(); + + auto VecIt = ValueInfos.find(Vec); + auto InsertedIt = ValueInfos.find(Inserted); + assert(VecIt != ValueInfos.end() && InsertedIt != ValueInfos.end()); + const auto &VecInfo = VecIt->second; + const auto &InsertedInfo = InsertedIt->second; + unsigned NumInsertedBits = 
Inserted->getType()->getPrimitiveSizeInBits(); + assert(NumInsertedBits % 8 == 0 && NumInsertedBits == 8 * DL.getTypeStoreSize(Inserted->getType())); + unsigned NumInsertedBytes = NumInsertedBits / 8; + + // Combine AggInfo and InsertedInfo + return VIB.createInsertion(VecInfo, InsertedInfo, *Offset, NumInsertedBytes); + } + case Instruction::InsertValue: { + auto *IV = cast(Inst); + auto *Agg = IV->getAggregateOperand(); + auto *Inserted = IV->getInsertedValueOperand(); + auto AggIt = ValueInfos.find(Agg); + auto InsertedIt = ValueInfos.find(Inserted); + assert(AggIt != ValueInfos.end() && InsertedIt != ValueInfos.end()); + + const auto &AggInfo = AggIt->second; + const auto &InsertedInfo = InsertedIt->second; + + unsigned Offset = computeByteOffsetInAggregate(Agg->getType(), IV->getIndices(), DL); + unsigned NumInsertedBytes = DL.getTypeStoreSize(Inserted->getType()); + + // Combine AggInfo and InsertedInfo + return VIB.createInsertion(AggInfo, InsertedInfo, Offset, NumInsertedBytes); + } + case Instruction::PHI: { + auto *PN = cast(Inst); + SmallVector ArgValueInfos; + for (Value *Val : PN->incoming_values()) { + auto It = ValueInfos.find(Val); + if (It == ValueInfos.end()) { + // The incoming value has not been analyzed yet. + // This can be caused by a loop, which we currently don't support. + // We could repeatedly propagate through the loop until a stable state is reached. 
+ return VIB.createDynamic(); + } + ArgValueInfos.push_back(&It->second); + } + return VIB.createSelect(ArgValueInfos); + } + case Instruction::Select: { + auto *SI = cast(Inst); + auto *TrueVal = SI->getTrueValue(); + auto *FalseVal = SI->getFalseValue(); + auto TrueIt = ValueInfos.find(TrueVal); + auto FalseIt = ValueInfos.find(FalseVal); + assert(TrueIt != ValueInfos.end() && FalseIt != ValueInfos.end()); + + const auto &TrueInfo = TrueIt->second; + const auto &FalseInfo = FalseIt->second; + + return VIB.createSelect({&TrueInfo, &FalseInfo}); + } + // For these instructions, don't waste time trying to compute known bits + case Instruction::Call: + case Instruction::GetElementPtr: + case Instruction::Load: + case Instruction::PtrToInt: // PtrToInt and IntToPtr could be supported, but modeling the trunc/zext + case Instruction::IntToPtr: // part is annoying, and we don't need it now. + case Instruction::Store: { + return VIB.createDynamic(); + } + default: { + // As last option, try to use computeKnownBits if possible. + // computeKnownBits also supports vector type, but in that case returns bits common bits of all elements. + // We are however interested in bits of the whole value. Working on the full vector would require a bitcast + // to an integer, but we don't wan't to add instructions in the analysis. 
+ if (V->getType()->isIntegerTy()) { + auto KnownBits = computeKnownBits(V, DL); + return VIB.createConstant(KnownBits); + } + return VIB.createDynamic(); + } + } + llvm_unreachable("unexpected case"); +} + +ValueInfo &ValueOriginTracker::getOrComputeValueInfo(llvm::Value *V, bool KnownToBeNew) { + if (!KnownToBeNew) { + auto It = ValueInfos.find(V); + if (It != ValueInfos.end()) + return It->second; + } + auto InsertionResult = ValueInfos.insert({V, computeValueInfo(V)}); + assert(InsertionResult.second); + return InsertionResult.first->second; +} + +ValueInfo ValueOriginTracker::getValueInfo(llvm::Value *V) { + analyzeValues(V); + assert(ValueInfos.contains(V)); + return ValueInfos[V]; +} + +void ValueOriginTracker::analyzeValues(ArrayRef Values) { + SmallVector WorkList; + SetVector PendingFunctions; + DenseSet PendingBBs; + DenseSet PendingInstructions; + + // Collect all values that the passed values depend on, by working through + // all operands. Instructions are marked in PendingInstructions for later + // processing, other values are directly processed. + + auto AddToWorkList = [&](Value *V) { + if (ValueInfos.contains(V)) { + // Already analyzed, nothing to do + return; + } + if (auto *Inst = dyn_cast(V)) { + bool Inserted = PendingInstructions.insert(Inst).second; + if (Inserted) { + WorkList.push_back(Inst); + if (PendingBBs.insert(Inst->getParent()).second) + PendingFunctions.insert(Inst->getFunction()); + } + } else { + // With general value assumptions, we'd need to add something here to ensure processing of dependencies. 
+ static_assert(std::is_same_v); + getOrComputeValueInfo(V, true); + } + }; + + for (auto *V : Values) + AddToWorkList(V); + + while (!WorkList.empty()) { + // Add instruction operands to the work list + auto *Inst = WorkList.pop_back_val(); + for (auto &Op : Inst->operands()) + AddToWorkList(Op); + + // Add any instructions referenced by origin assumptions to the work list as well + auto OriginAssumptionIt = OriginAssumptions.find(Inst); + if (OriginAssumptionIt != OriginAssumptions.end()) { + const ValueInfo &VI = OriginAssumptionIt->second; + for (const SliceInfo &SI : VI.Slices) { + if (SI.DynamicValue) + AddToWorkList(SI.DynamicValue); + } + } + } + + for (auto *F : PendingFunctions) { + // Traverse BBs of the function in RPO order. + // This ensures instruction dependencies are analyzed before depending instructions, except for loops. + ReversePostOrderTraversal RPOT(F); + for (auto &BB : RPOT) { + if (!PendingBBs.contains(BB)) + continue; + for (auto &Inst : *BB) { + bool WasPending = PendingInstructions.erase(&Inst); + if (WasPending) + getOrComputeValueInfo(&Inst, true); + } + } + } +} + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueOriginTrackingTestPass.cpp b/compilerutils/lib/ValueOriginTrackingTestPass.cpp new file mode 100644 index 0000000000..ff682f5d2c --- /dev/null +++ b/compilerutils/lib/ValueOriginTrackingTestPass.cpp @@ -0,0 +1,133 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#include "ValueOriginTrackingTestPass.h" +#include "compilerutils/CompilerUtils.h" +#include "compilerutils/ValueOriginTracking.h" +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace CompilerUtils; + +namespace { + +cl::opt BytesPerSliceOption("value-origin-tracking-test-bytes-per-slice", cl::init(4)); +cl::opt MaxBytesPerValueOption("value-origin-tracking-test-max-bytes-per-value", cl::init(512)); + +// Parse assumptions made via calls to the assume function. 
+ValueOriginTracker::ValueOriginAssumptions parseAssumptions(Module &Module, Function &AssumeFunc) { + ValueOriginTracker::ValueOriginAssumptions Result; + forEachCall(AssumeFunc, [&](CallInst &AssumptionCall) { + unsigned NumArgs = AssumptionCall.arg_size(); + // We expect one arg for the value, and two per slice. + if (NumArgs % 2 != 1) + report_fatal_error("unexpected number of assumption args"); + // The value we put an assumption on + Value *V = AssumptionCall.getArgOperand(0); + Instruction *Inst = dyn_cast(V); + if (Inst == nullptr) + report_fatal_error("assumptions are only allowed on instructions"); + ValueOriginTracker::ValueInfo Assumption{}; + unsigned NumSlices = (NumArgs - 1) / 2; + for (unsigned SliceIdx = 0; SliceIdx < NumSlices; ++SliceIdx) { + unsigned SliceArgBeginIdx = 1 + 2 * SliceIdx; + Value *ReferencedValueOrConstant = AssumptionCall.getArgOperand(SliceArgBeginIdx); + if (isa(ReferencedValueOrConstant)) { + Assumption.Slices.push_back({ValueTracking::SliceStatus::UndefOrPoison}); + } else if (auto *CIValue = dyn_cast(ReferencedValueOrConstant)) { + ValueTracking::SliceInfo SI{ValueTracking::SliceStatus::Constant}; + if (!CIValue->getType()->isIntegerTy(32)) + report_fatal_error("expected i32 constant"); + SI.ConstantValue = CIValue->getZExtValue(); + Assumption.Slices.push_back(SI); + } else { + // Dynamic value reference + ValueTracking::SliceInfo SI{ValueTracking::SliceStatus::Dynamic}; + SI.DynamicValue = ReferencedValueOrConstant; + Value *DynamicValueByteOffsetValue = AssumptionCall.getArgOperand(SliceArgBeginIdx + 1); + auto *DynamicValueByteOffsetValueCI = dyn_cast(DynamicValueByteOffsetValue); + if (DynamicValueByteOffsetValueCI == nullptr || !DynamicValueByteOffsetValueCI->getType()->isIntegerTy(32)) + report_fatal_error("expected i32 constant"); + SI.DynamicValueByteOffset = DynamicValueByteOffsetValueCI->getZExtValue(); + Assumption.Slices.push_back(SI); + } + } + bool Inserted = Result.insert({Inst, Assumption}).second; + if 
(!Inserted) + report_fatal_error("value with duplicate assumption"); + }); + return Result; +} + +} // namespace + +namespace CompilerUtils { + +llvm::PreservedAnalyses ValueOriginTrackingTestPass::run(llvm::Module &Module, + llvm::ModuleAnalysisManager &AnalysisManager) { + Function *AnalyzeFunc = Module.getFunction("analyze"); + if (!AnalyzeFunc) + return PreservedAnalyses::all(); + + ValueOriginTracker::ValueOriginAssumptions Assumptions; + Function *AssumeFunc = Module.getFunction("assume"); + if (AssumeFunc) { + Assumptions = parseAssumptions(Module, *AssumeFunc); + } + + ValueOriginTracker VOT{Module.getDataLayout(), BytesPerSliceOption.getValue(), MaxBytesPerValueOption.getValue(), + Assumptions}; + + auto Prefix = "[VOT]: "; + + // Traverse all functions instead of the users of AnalyzeFunc to group output by function + for (auto &F : Module) { + if (F.isDeclaration()) + continue; + + outs() << Prefix << F.getName() << "\n"; + for (auto &BB : F) { + for (auto &I : BB) { + auto *CI = dyn_cast(&I); + if (!CI || CI->getCalledOperand() != AnalyzeFunc) { + continue; + } + + for (Value *Op : CI->data_ops()) { + auto VI = VOT.getValueInfo(Op); + outs() << Prefix << "(" << *Op << "): " << VI << "\n"; + } + outs() << "\n"; + } + } + } + return PreservedAnalyses::all(); +} + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueOriginTrackingTestPass.h b/compilerutils/lib/ValueOriginTrackingTestPass.h new file mode 100644 index 0000000000..3505e0c085 --- /dev/null +++ b/compilerutils/lib/ValueOriginTrackingTestPass.h @@ -0,0 +1,42 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace CompilerUtils { + +// Helper pass to enable lit tests of ValueOriginTracker. +// Calls to a function called "analyze" triggers an analysis its arguments and outputs the analysis to stdout. 
+class ValueOriginTrackingTestPass : public llvm::PassInfoMixin { +public: + llvm::PreservedAnalyses run(llvm::Module &Module, llvm::ModuleAnalysisManager &AnalysisManager); + + static llvm::StringRef name() { return "Test ValueOriginTracking"; } +}; + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueSpecialization.cpp b/compilerutils/lib/ValueSpecialization.cpp new file mode 100644 index 0000000000..4dda822ec0 --- /dev/null +++ b/compilerutils/lib/ValueSpecialization.cpp @@ -0,0 +1,358 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#include "compilerutils/ValueSpecialization.h" +#include "llvm/IR/Module.h" +#include + +#define DEBUG_TYPE "value-specialization" + +using namespace CompilerUtils; +using namespace llvm; + +namespace CompilerUtils { + +namespace { + +struct SpecializationSummary { + bool AllDwordsAreSpecialized = true; + bool AnyDwordIsSpecialized = false; +}; +static SpecializationSummary +computeSpecializationSummary(ArrayRef DwordInfos) { + SpecializationSummary Result = {}; + for (const auto &DWI : DwordInfos) { + if (DWI.Kind != ValueSpecializer::SpecializationKind::None) + Result.AnyDwordIsSpecialized = true; + else + Result.AllDwordsAreSpecialized = false; + } + return Result; +} + +} // namespace + +ValueSpecializer::ValueSpecializer(Module &M) + : B{M.getContext(), ConstantFolder{}, + IRBuilderCallbackInserter{[this](Instruction *Inst) { NewInsts.insert(Inst); }}}, + DL{M.getDataLayout()}, I32{Type::getInt32Ty(M.getContext())}, I64{Type::getInt64Ty(M.getContext())}, + NumReplacedDwords{}, NewInsts{} { +} + +ValueSpecializer::ReplacementResult +ValueSpecializer::replaceDwords(Value *Val, ArrayRef DwordInfos, bool ReplaceUses, + bool PreservePreviousInsertionPoint, StringRef NameSuffix) { + assert(divideCeil(DL.getTypeStoreSize(Val->getType()), 4) == DwordInfos.size()); + NewInsts.clear(); + NumReplacedDwords = 0; + + if (IsFirstCall || !PreservePreviousInsertionPoint) { + if (auto *Arg = dyn_cast(Val)) { + B.SetInsertPoint(Arg->getParent()->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); + } else { + // Insert *after* the given instruction, so we can use it + auto *Inst = cast(Val); + B.SetInsertPoint(Inst->getInsertionPointAfterDef().value()); + } + } + IsFirstCall = false; + + SmallVector Indices; + Value *Replacement = replaceDwordsImpl(Val, Indices, Val->getType(), DwordInfos, (Val->getName() + NameSuffix).str()); + // Should be nullptr if 
nothing changed + assert(Replacement != Val); + if (Replacement != nullptr && ReplaceUses) { + Val->replaceUsesWithIf(Replacement, [this](Use &U) -> bool { return !NewInsts.contains(U.getUser()); }); + } + return {Replacement, NumReplacedDwords}; +} + +Value *ValueSpecializer::replaceDwordsInNonAggregate(Type *Ty, Value *Val, ArrayRef DwordInfos, + StringRef ReplacementName) { + assert(!Ty->isAggregateType()); + + unsigned NumBytes = DL.getTypeStoreSize(Ty); + if (NumBytes % 4) { + // Small and misaligned types are not supported for now. + // We could support specializing prefixes of large, misaligned types later. + return nullptr; + } + [[maybe_unused]] unsigned NumDwords = NumBytes / 4; + assert(DwordInfos.size() == NumDwords); + + if (Ty->isIntegerTy()) { + if (Ty->getIntegerBitWidth() < 32) + return nullptr; + if (Ty->getIntegerBitWidth() == 32) { + const DwordSpecializationInfo &DWI = DwordInfos[0]; + if (DWI.Kind == SpecializationKind::Constant) { + ++NumReplacedDwords; + return getI32Constant(DWI.ConstantValue); + } + if (DWI.Kind == SpecializationKind::FrozenPoison) { + ++NumReplacedDwords; + return getFrozenPoison(Ty); + } + return nullptr; + } + if (Ty->getIntegerBitWidth() == 64) { + const DwordSpecializationInfo &LowInfo = DwordInfos[0]; + const DwordSpecializationInfo &HighInfo = DwordInfos[1]; + SpecializationKind LowKind = LowInfo.Kind; + SpecializationKind HighKind = HighInfo.Kind; + + if (LowKind == HighKind) { + // This can be handled without a bitwise or. + NumReplacedDwords += 2; + if (LowKind == SpecializationKind::Constant) { + // return a single i64 constant. + uint64_t I64Constant = HighInfo.ConstantValue; + I64Constant <<= 32; + I64Constant |= LowInfo.ConstantValue; + return getI64Constant(I64Constant); + } + assert(LowKind == SpecializationKind::FrozenPoison); + return getFrozenPoison(I64); + } + + // Create two separate i64s containing the low and high dwords, and OR them together. 
+ uint64_t SingleDwordMask = ~(uint32_t{0}); + Value *LowDword = nullptr; + if (LowKind == SpecializationKind::None) { + assert(Val); + LowDword = B.CreateAnd(Val, SingleDwordMask); + } else { + ++NumReplacedDwords; + if (LowKind == SpecializationKind::Constant) { + LowDword = getI64Constant(LowInfo.ConstantValue); + } else { + assert(LowKind == SpecializationKind::FrozenPoison); + LowDword = B.CreateAnd(getFrozenPoison(I64), SingleDwordMask); + } + } + + Value *HighDword = nullptr; + if (HighKind == SpecializationKind::None) { + assert(Val); + HighDword = B.CreateAnd(Val, SingleDwordMask << 32); + } else { + ++NumReplacedDwords; + if (HighKind == SpecializationKind::Constant) { + uint64_t HighDwordConstant = HighInfo.ConstantValue; + HighDwordConstant <<= 32; + HighDword = getI64Constant(HighDwordConstant); + } else { + assert(HighKind == SpecializationKind::FrozenPoison); + HighDword = B.CreateAnd(getFrozenPoison(I64), SingleDwordMask << 32); + } + } + + return B.CreateOr(LowDword, HighDword, ReplacementName); + } + + // Give up on other types + return nullptr; + } + + bool IsPointer = Ty->isPointerTy(); + if (Ty->isFloatingPointTy() || IsPointer) { + unsigned BitWidth = 0; + if (auto *PtrTy = dyn_cast(Ty)) + BitWidth = DL.getPointerSizeInBits(PtrTy->getAddressSpace()); + else + BitWidth = Ty->getScalarSizeInBits(); + + if (BitWidth < 32) + return nullptr; + + // Reduce this to integer specialization + Type *IntTy = IntegerType::get(Ty->getContext(), BitWidth); + Value *BaseValue = nullptr; + if (Val) { + // Need to preserve some data, so start with bitcast of original value + if (IsPointer) + BaseValue = B.CreatePtrToInt(Val, IntTy); + else + BaseValue = B.CreateBitCast(Val, IntTy); + } + Value *SpecializedAsInt = replaceDwordsInNonAggregate(IntTy, BaseValue, DwordInfos, {}); + if (!SpecializedAsInt) + return nullptr; + + if (IsPointer) + return B.CreateIntToPtr(SpecializedAsInt, Ty, ReplacementName); + return B.CreateBitCast(SpecializedAsInt, Ty, 
ReplacementName); + } + + // Last remaining case: vectors. + if (isa(Ty)) { + // Not supported. + return nullptr; + } + auto *VTy = cast(Ty); + // Similar to the aggregate case: For small elements, give up. + // For dword-sized elements, just insert the new value. + // For larger elements, extract the value, update it, and insert it again. + Type *ElemTy = VTy->getElementType(); + if (!ElemTy->isIntegerTy() && !ElemTy->isFloatingPointTy()) { + // E.g. pointers, not supported. Could add support if necessary. + return nullptr; + } + unsigned NumElems = VTy->getNumElements(); + unsigned ElemNumBits = ElemTy->getPrimitiveSizeInBits(); + if (ElemNumBits % 32) { + // Give up. + return nullptr; + } + unsigned ElemNumDwords = ElemNumBits / 32; + + // While working on the vector elements, keep track of the current replaced full vector value. + Value *ReplacedVector = Val; + for (unsigned ElemIdx = 0; ElemIdx < NumElems; ++ElemIdx) { + unsigned ElemDwordBegin = ElemIdx * ElemNumDwords; + unsigned ElemDwordEnd = ElemDwordBegin + ElemNumDwords; + assert(ElemDwordEnd <= DwordInfos.size()); + + ArrayRef ElemDwordInfos{DwordInfos.data() + ElemDwordBegin, + DwordInfos.data() + ElemDwordEnd}; + + auto Summary = computeSpecializationSummary(ElemDwordInfos); + if (!Summary.AnyDwordIsSpecialized) { + // Nothing to do on this vector element. + assert(Val != nullptr); + continue; + } + + Value *ElemBaseValue = Summary.AllDwordsAreSpecialized ? nullptr : B.CreateExtractElement(ReplacedVector, ElemIdx); + Value *ReplacedElem = replaceDwordsInNonAggregate(ElemTy, ElemBaseValue, ElemDwordInfos, {}); + if (ReplacedElem) { + if (ReplacedVector == nullptr) { + // Start with a frozen poison value + ReplacedVector = getFrozenPoison(Ty); + } + ReplacedVector = B.CreateInsertElement(ReplacedVector, ReplacedElem, ElemIdx, ReplacementName); + } + } + + // Return nullptr if nothing changed. + return ReplacedVector != Val ? 
ReplacedVector : nullptr; +} + +Value *ValueSpecializer::replaceDwordsImpl(Value *RootVal, SmallVectorImpl &Indices, Type *CurTy, + ArrayRef DwordInfos, StringRef ReplacementName) { + assert(RootVal && CurTy); + + auto Summary = computeSpecializationSummary(DwordInfos); + if (!Summary.AnyDwordIsSpecialized) { + // Nothing to be done. + return nullptr; + } + + if (!CurTy->isAggregateType()) { + // Base value to perform non-aggregate specialization on. Nullptr if all dwords are replaced. + // The called specialization function then creates a base frozen poison value if necessary. + // This might not be necessary in some cases, e.g. for a dword-sized value like an i32. + Value *BaseValue = nullptr; + if (!Summary.AllDwordsAreSpecialized) { + if (Indices.empty()) { + assert(RootVal->getType() == CurTy); + BaseValue = RootVal; + } else { + // We are part of a (possibly nested) aggregate. Extract our value to work on it. + BaseValue = B.CreateExtractValue(RootVal, Indices); + } + } + + // If the result of this call is going to be the final result, forward the replacement name. + // Otherwise, we will create an insertvalue instruction that will get the name. + StringRef NestedReplacementName = Indices.empty() ? 
ReplacementName : ""; + Value *Replaced = replaceDwordsInNonAggregate(CurTy, BaseValue, DwordInfos, NestedReplacementName); + if (!Replaced) + return nullptr; + + if (Indices.empty()) + return Replaced; + + // Insert the replacement into the root value + return B.CreateInsertValue(RootVal, Replaced, Indices, ReplacementName); + } + + // Final case: Aggregates + assert(CurTy->isAggregateType()); + + const StructLayout *SL = nullptr; + ArrayType *ArrTy = dyn_cast(CurTy); + StructType *STy = dyn_cast(CurTy); + unsigned NumElements = -1; + if (ArrTy) { + NumElements = ArrTy->getNumElements(); + } else { + NumElements = STy->getNumElements(); + SL = DL.getStructLayout(STy); + } + + // While working on the aggregate elements, keep track of the current replaced full aggregate value. + Value *ReplacedRootVal = RootVal; + for (unsigned ElemIdx = 0; ElemIdx < NumElements; ++ElemIdx) { + // Determine byte range covered by the element + unsigned ElemByteOffset = -1; + Type *ElemTy = nullptr; + if (ArrTy) { + ElemTy = ArrTy->getElementType(); + unsigned ElemAllocSize = DL.getTypeAllocSize(ElemTy); + ElemByteOffset = ElemIdx * ElemAllocSize; + } else { + ElemTy = STy->getElementType(ElemIdx); + ElemByteOffset = SL->getElementOffset(ElemIdx); + } + unsigned ElemByteSize = DL.getTypeStoreSize(ElemTy); + + if (ElemByteOffset % 4 != 0 || ElemByteSize % 4 != 0) { + // Give up on small/misaligned types + continue; + } + + // The element corresponds to a sub-range of CurDwordInfos. Determine it. 
+ unsigned ElemDwordBegin = ElemByteOffset / 4; + unsigned ElemNumDwords = ElemByteSize / 4; + unsigned ElemDwordEnd = ElemDwordBegin + ElemNumDwords; + assert(ElemDwordEnd <= DwordInfos.size()); + + ArrayRef ElemDwordInfos{DwordInfos.data() + ElemDwordBegin, + DwordInfos.data() + ElemDwordEnd}; + Indices.push_back(ElemIdx); + Value *Replaced = replaceDwordsImpl(ReplacedRootVal, Indices, ElemTy, ElemDwordInfos, ReplacementName); + Indices.pop_back(); + if (Replaced) { + // Replacement was successful. In the next iteration, use Replaced as base value to operate on. + ReplacedRootVal = Replaced; + } + } + + // Return nullptr if nothing changed + return ReplacedRootVal != RootVal ? ReplacedRootVal : nullptr; +} + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueSpecializationTestPass.cpp b/compilerutils/lib/ValueSpecializationTestPass.cpp new file mode 100644 index 0000000000..884ce1733a --- /dev/null +++ b/compilerutils/lib/ValueSpecializationTestPass.cpp @@ -0,0 +1,160 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#include "ValueSpecializationTestPass.h" +#include "compilerutils/CompilerUtils.h" +#include "compilerutils/ValueOriginTracking.h" +#include "compilerutils/ValueSpecialization.h" +#include +#include + +using namespace llvm; +using namespace CompilerUtils; + +namespace { + +enum TestFlags { SkipValueTrackingCheck = 0x1, AllowFailure = 0x2, Invalid = 0x4 }; + +struct ValueSpecializationInfo { + llvm::Value *Val; + llvm::SmallVector DwordInfos; + unsigned NumToBeReplacedDwords = 0; + TestFlags Flags; +}; + +// Syntax: +// call @specialize(i32 %flags, %val, i32 dw0Status, i32 dw0Constant, [i32 dw1Status, i32 dw1Constant, ...]) +ValueSpecializationInfo parseSpecializeCall(llvm::CallInst &CI) { + unsigned NumArgs = CI.arg_size(); + if (NumArgs % 2 != 0) + report_fatal_error("Unexpected num args for specialize"); + unsigned NumDwords = (NumArgs - 2) / 2; + + llvm::SmallVector DwordInfos; + unsigned NumReplacedDwords = 0; + DwordInfos.reserve(NumDwords); + for (unsigned DwordIdx = 0; DwordIdx < NumDwords; ++DwordIdx) { + llvm::Value *KindValue = CI.getArgOperand(2 + 2 * DwordIdx); + if (!isa(KindValue)) + report_fatal_error("Unexpected non-integer kind argument"); + auto KindInt = cast(KindValue)->getZExtValue(); + if (KindInt >= static_cast(ValueSpecializer::SpecializationKind::Count)) + report_fatal_error("Invalid specialization kind"); + auto Kind = 
static_cast(KindInt); + uint32_t Constant = 0; + if (Kind == ValueSpecializer::SpecializationKind::Constant) { + llvm::Value *ConstantValueValue = CI.getArgOperand(2 + 2 * DwordIdx + 1); + if (!isa(ConstantValueValue)) + report_fatal_error("Unexpected non-integer constant value argument"); + auto ConstantValueInt = cast(ConstantValueValue)->getZExtValue(); + if (ConstantValueInt >= UINT32_MAX) + report_fatal_error("Too large constant value"); + Constant = static_cast(ConstantValueInt); + } + DwordInfos.push_back({Kind, Constant}); + if (Kind != ValueSpecializer::SpecializationKind::None) + ++NumReplacedDwords; + } + Value *TestFlagsValue = CI.getArgOperand(0); + if (!isa(TestFlagsValue)) + report_fatal_error("Unexpected non-integer constant value argument"); + auto TestFlagsInt = cast(TestFlagsValue)->getZExtValue(); + if (TestFlagsInt >= static_cast(TestFlags::Invalid)) + report_fatal_error("Invalid test flags value"); + return {CI.getArgOperand(1), DwordInfos, NumReplacedDwords, static_cast(TestFlagsInt)}; +} + +} // namespace + +namespace CompilerUtils { + +llvm::PreservedAnalyses ValueSpecializationTestPass::run(llvm::Module &Module, + llvm::ModuleAnalysisManager &AnalysisManager) { + Function *SpecializeFunc = Module.getFunction("specialize"); + if (!SpecializeFunc) + return PreservedAnalyses::all(); + + SmallVector ToBeDeleted; + for (auto &F : Module) { + for (auto &BB : F) { + // Use one specialize per BB, and re-use insertion points. 
+ ValueSpecializer VS(Module); + + for (auto &Inst : BB) { + auto *CI = dyn_cast(&Inst); + if (!CI || CI->getCalledOperand() != SpecializeFunc) { + continue; + } + ToBeDeleted.push_back(CI); + + ValueSpecializationInfo VSI = parseSpecializeCall(*CI); + bool ReplaceUses = true; + bool PreserveInsertionPoint = true; + const auto [Replacement, NumReplacedDwords] = + VS.replaceDwords(VSI.Val, VSI.DwordInfos, ReplaceUses, PreserveInsertionPoint); + + if (!(VSI.Flags & TestFlags::AllowFailure) && NumReplacedDwords != VSI.NumToBeReplacedDwords) + report_fatal_error("Less than expected replacements"); + if (NumReplacedDwords != 0 && Replacement == nullptr) + report_fatal_error("Missing replacement result"); + + if (Replacement && !(VSI.Flags & TestFlags::SkipValueTrackingCheck)) { + // Run value tracking analysis on the replacement result, and check that it matches the requested replacements + ValueOriginTracker VOT{Module.getDataLayout(), 4, 256}; + const ValueTracking::ValueInfo VI = VOT.getValueInfo(Replacement); + if (VI.Slices.size() != VSI.DwordInfos.size()) + report_fatal_error("Size mismatch"); + for (unsigned DwordIdx = 0; DwordIdx < VI.Slices.size(); ++DwordIdx) { + const ValueTracking::SliceInfo &SI = VI.Slices[DwordIdx]; + const ValueSpecializer::DwordSpecializationInfo &DSI = VSI.DwordInfos[DwordIdx]; + if (DSI.Kind == ValueSpecializer::SpecializationKind::Constant) { + if (SI.Status != ValueTracking::SliceStatus::Constant || SI.ConstantValue != DSI.ConstantValue) + report_fatal_error("Failed constant specialization"); + } + if (DSI.Kind == ValueSpecializer::SpecializationKind::FrozenPoison) { + if (SI.Status != ValueTracking::SliceStatus::UndefOrPoison) + report_fatal_error("Failed frozen poison specialization"); + } + } + } + + dbgs() << "[VS]: Replaced " << NumReplacedDwords << " dwords in "; + VSI.Val->printAsOperand(dbgs()); + if (Replacement) { + dbgs() << ", replaced by "; + Replacement->printAsOperand(dbgs()); + } + dbgs() << "\n"; + } + } + } + + for 
(auto *CI : ToBeDeleted) + CI->eraseFromParent(); + + return PreservedAnalyses::none(); +} + +} // namespace CompilerUtils diff --git a/compilerutils/lib/ValueSpecializationTestPass.h b/compilerutils/lib/ValueSpecializationTestPass.h new file mode 100644 index 0000000000..20465412ef --- /dev/null +++ b/compilerutils/lib/ValueSpecializationTestPass.h @@ -0,0 +1,42 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace CompilerUtils { + +// Helper pass to enable lit tests of ValueSpecializer. 
+// Calls to a function named "specialize" trigger value specialization. +class ValueSpecializationTestPass : public llvm::PassInfoMixin { +public: + llvm::PreservedAnalyses run(llvm::Module &Module, llvm::ModuleAnalysisManager &AnalysisManager); + + static llvm::StringRef name() { return "Test ValueSpecialization"; } +}; + +} // namespace CompilerUtils diff --git a/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil b/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil index 3eae8bf5ab..1bf2fa9f61 100644 --- a/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil +++ b/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil @@ -94,6 +94,19 @@ define i1 @test_struct_gep(ptr %arg, i32 %index) { ret i1 %res } +define i1 @test_shufflevector(<2 x i1> %args.0, <2 x i1> %args.1) { +; CHECK-LABEL: define {{[^@]+}}@test_shufflevector +; CHECK-SAME: (<2 x i32> [[ARGS_0:%.*]], <2 x i32> [[ARGS_1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[ARGS_0]], <2 x i32> [[ARGS_1]], <1 x i32> +; CHECK-NEXT: [[RES2:%.*]] = extractelement <1 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[RES2]] to i1 +; CHECK-NEXT: ret i1 [[TMP1]] +; + %tmp = shufflevector <2 x i1> %args.0, <2 x i1> %args.1, <1 x i32> + %res = extractelement <1 x i1> %tmp, i32 0 + ret i1 %res +} + define void @test_pointee_metadata(<7 x i1>, ptr) !types !1 { ; CHECK-LABEL: define {{[^@]+}}@test_pointee_metadata ; CHECK-SAME: (<7 x i32> [[TMP0:%.*]], ptr [[TMP1:%.*]]) !types [[META2:![0-9]+]] { diff --git a/compilerutils/test/value-origin-tracking/assumptions.ll b/compilerutils/test/value-origin-tracking/assumptions.ll new file mode 100644 index 0000000000..874630a20c --- /dev/null +++ b/compilerutils/test/value-origin-tracking/assumptions.ll @@ -0,0 +1,72 @@ +; RUN: opt -passes="value-origin-tracking-test" -S %s | FileCheck %s + +declare void @analyze(...) + +; Intrinsic to declare value origin assumptions. 
+; Syntax: +; call void @assume(%val, [constantOrDynamicValue], i32 dynamicValueByteOffset, [...]) +declare void @assume(...) + +declare i32 @opaque() + +define void @testSimpleAssumptions(i32 %arg) { +; CHECK-LABEL: testSimpleAssumptions + + %opaque = call i32 @opaque() +; CHECK: %opaque = call i32 @opaque()): Dynamic: %opaque = {{.*}} (offset 0) + call void @analyze(i32 %opaque) + + %opaque.with.constant.assumption = call i32 @opaque() + call void @assume(i32 %opaque.with.constant.assumption, i32 u0xbeef, i32 0) +; CHECK: %opaque.with.constant.assumption = call i32 @opaque()): Constant: 0xbeef + call void @analyze(i32 %opaque.with.constant.assumption) + + %opaque.with.dynamic.assumption = call i32 @opaque() + call void @assume(i32 %opaque.with.dynamic.assumption, i32 %arg, i32 0) +; CHECK: %opaque.with.dynamic.assumption = call i32 @opaque()): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(i32 %opaque.with.dynamic.assumption) + + %opaque.with.self.assumption = call i32 @opaque() + call void @assume(i32 %opaque.with.self.assumption, i32 %opaque.with.self.assumption, i32 0) +; CHECK: %opaque.with.self.assumption = call i32 @opaque()): Dynamic: %opaque.with.self.assumption {{.*}} (offset 0) + call void @analyze(i32 %opaque.with.self.assumption) + + %opaque.with.nested.assumption = call i32 @opaque() + call void @assume(i32 %opaque.with.nested.assumption, i32 %opaque.with.dynamic.assumption, i32 0) +; CHECK: %opaque.with.nested.assumption = call i32 @opaque()): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(i32 %opaque.with.nested.assumption) + + %derived = bitcast i32 %opaque.with.nested.assumption to float +; CHECK: %derived = bitcast i32 %opaque.with.nested.assumption to float): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(float %derived) + + ; Test that we currently don't merge assumptions with our own analysis on the same value: + ; A trivial assumption can lead to worse results. 
+ %trivial = bitcast i32 0 to float + call void @assume(float %trivial, float %trivial, i32 0) +; CHECK: %trivial = bitcast i32 0 to float): Dynamic: %trivial + call void @analyze(float %trivial) + + ret void +} + +declare [3 x i32] @permute([3 x i32] %arr) + +; Test assumptions on larger types with nontrivial offsets +; Add assumptions assuming that @permute permutes the input array. +; After three rounds, we should get back the original one. +define void @testAssumptionsWithOffsets([3 x i32] %arg) { +; CHECK-LABEL: testAssumptionsWithOffsets + %permuted.0 = call [3 x i32] @permute([3 x i32] %arg) + call void @assume([3 x i32] %permuted.0, [3 x i32] %arg, i32 4, [3 x i32] %arg, i32 8, [3 x i32] %arg, i32 0) + %permuted.1 = call [3 x i32] @permute([3 x i32] %permuted.0) + call void @assume([3 x i32] %permuted.1, [3 x i32] %permuted.0, i32 4, [3 x i32] %permuted.0, i32 8, [3 x i32] %permuted.0, i32 0) +; CHECK: %permuted.1 = {{.*}}: Dynamic (argument): [3 x i32] %arg (offset 8); Dynamic (argument): [3 x i32] %arg (offset 0); Dynamic (argument): [3 x i32] %arg (offset 4) + call void @analyze([3 x i32] %permuted.1) + %permuted.final = call [3 x i32] @permute([3 x i32] %permuted.1) + call void @assume([3 x i32] %permuted.final, [3 x i32] %permuted.1, i32 4, [3 x i32] %permuted.1, i32 8, [3 x i32] %permuted.1, i32 0) +; CHECK: %permuted.final = {{.*}}: Dynamic (argument): [3 x i32] %arg (offset 0); Dynamic (argument): [3 x i32] %arg (offset 4); Dynamic (argument): [3 x i32] %arg (offset 8) + call void @analyze([3 x i32] %permuted.final) + + ret void +} diff --git a/compilerutils/test/value-origin-tracking/basic-tests.ll b/compilerutils/test/value-origin-tracking/basic-tests.ll new file mode 100644 index 0000000000..d722c02f77 --- /dev/null +++ b/compilerutils/test/value-origin-tracking/basic-tests.ll @@ -0,0 +1,317 @@ +; RUN: opt -passes="value-origin-tracking-test" -S %s | FileCheck %s + +declare void @analyze(...) 
+ +define void @testConstantInt() { +; CHECK-LABEL: testConstantInt + +; CHECK: (i1 true): Constant: 0x1 + call void @analyze(i1 true) + +; CHECK: (i8 16): Constant: 0x10 + call void @analyze(i8 16) + +; CHECK: (i16 17): Constant: 0x11 + call void @analyze(i16 17) + +; CHECK: (i32 64): Constant: 0x40 + call void @analyze(i32 64) + +; CHECK: (i64 4294967311): Constant: 0xf; Constant: 0x1 + call void @analyze(i64 u0x10000000f) + + ret void +} + +define void @testConstantFloat() { +; CHECK-LABEL: testConstantFloat + +; CHECK: (half 0xH1234): Constant: 0x1234 + call void @analyze(half 0xH1234) + +; CHECK: (float 1.250000e-01): Constant: 0x3e000000 + call void @analyze(float 1.250000e-01) + +; CHECK: (double 0x123456789ABCDEF): Constant: 0x89abcdef; Constant: 0x1234567 + call void @analyze(double 0x0123456789abcdef) + +; Check that float "zero" is not incorrectly handled as "null" +; CHECK: (float -0.000000e+00): Constant: 0x80000000 + call void @analyze(float -0.0) + +; CHECK: (float 1.250000e-01): Constant: 0x3e000000 + call void @analyze(float bitcast (i32 u0x3e000000 to float)) + ret void +} + +define void @testConstantVector() { +; CHECK-LABEL: testConstantVector + +; CHECK: (<2 x i32> zeroinitializer): Constant: 0x0; Constant: 0x0 + call void @analyze(<2 x i32> zeroinitializer) + +; CHECK: (<9 x i8> zeroinitializer): Constant: 0x0; Constant: 0x0; Constant: 0x0 + call void @analyze(<9 x i8> zeroinitializer) + +; CHECK: (<1 x i32> ): Constant: 0xdeadbeef + call void @analyze(<1 x i32> ) + +; CHECK: (<4 x i8> ): Constant: 0x4030201 + call void @analyze(<4 x i8> ) + +; CHECK: (<1 x float> ): Constant: 0x3e000000 + call void @analyze(<1 x float> ) + +; computeKnownBits only supports integer vectors, and our +; handling doesn't support smaller-than-slice element types. 
+; CHECK: (<1 x half> ): Dynamic + call void @analyze(<1 x half> ) + +; CHECK: (<4 x float> ): Constant: 0x0; Constant: 0x43800000; Constant: 0x0; UndefOrPoison + call void @analyze(<4 x float> ) + + ret void +} + +define void @testConstantArray() { +; CHECK-LABEL: testConstantArray + +; CHECK: ([2 x i32] zeroinitializer): Constant: 0x0; Constant: 0x0 + call void @analyze([2 x i32] zeroinitializer) + +; CHECK: ([9 x i8] zeroinitializer): Constant: 0x0; Constant: 0x0; Constant: 0x0 + call void @analyze([9 x i8] zeroinitializer) + +; CHECK: ([1 x i32] [i32 -559038737]): Constant: 0xdeadbeef + call void @analyze([1 x i32] [i32 u0xdeadbeef]) + +; In contrast to vectors, we can't detect constant arrays of small types. +; This is because llvm computeKnownBits supports vectors but not arrays, +; and our handling of constant arrays/vectors doesn't support element types +; smaller than slices. +; CHECK: ([4 x i8] c"\01\02\03\04"): Dynamic + call void @analyze([4 x i8] [i8 1, i8 2, i8 3, i8 4]) + +; CHECK: ([1 x float] [float 1.250000e-01]): Constant: 0x3e000000 + call void @analyze([1 x float] [float 1.250000e-01]) + +; CHECK: ([4 x float] [float 0.000000e+00, float 2.560000e+02, float 0.000000e+00, float undef]): Constant: 0x0; Constant: 0x43800000; Constant: 0x0; UndefOrPoison + call void @analyze([4 x float] [float 0.000000e+00, float bitcast (i32 u0x43800000 to float), float 0.000000e+00, float undef]) + + ret void +} + +%somestruct = type { i32, i8, half } +define void @testConstantStruct() { +; CHECK-LABEL: testConstantStruct +; Only support zeroinitializer for now + +; CHECK: (%somestruct zeroinitializer): Constant: 0x0; Constant: 0x0 + call void @analyze(%somestruct zeroinitializer) + +; CHECK: (%somestruct { i32 1, i8 1, half 0xH0000 }): Dynamic: {{.*}} (offset 0); Dynamic: {{.*}} (offset 4) + call void @analyze(%somestruct { i32 1, i8 1, half 0xH0 }) + + ret void +} + +define void @testDynamic(i32 %arg) { +; CHECK-LABEL: testDynamic +; CHECK: (i32 %arg): Dynamic 
(argument): i32 %arg (offset 0) + call void @analyze(i32 %arg) + ret void +} + +define void @testPoison() { +; CHECK-LABEL: testPoison +; CHECK: (i1 poison): UndefOrPoison + call void @analyze(i1 poison) +; CHECK: (i32 poison): UndefOrPoison + call void @analyze(i32 poison) +; CHECK: (double poison): UndefOrPoison; UndefOrPoison + call void @analyze(double poison) + + %freezePoison = freeze i32 poison +; CHECK: ( %freezePoison = {{.*}}): UndefOrPoison + call void @analyze(i32 %freezePoison) + + %freezeNonPoison = freeze i32 5 +; CHECK: ( %freezeNonPoison = {{.*}}): Constant: 0x5 + call void @analyze(i32 %freezeNonPoison) + ret void +} + +define void @testArray(i32 %arg) { +; CHECK-LABEL: testArray + %arr.1 = insertvalue [3 x i32] poison, i32 100, 0 + %arr.2 = insertvalue [3 x i32] %arr.1, i32 %arg, 1 + %extract.0 = extractvalue [3 x i32] %arr.2, 0 + %extract.1 = extractvalue [3 x i32] %arr.2, 1 + %extract.2 = extractvalue [3 x i32] %arr.2, 2 +; CHECK: ( %extract.0 = extractvalue [3 x i32] %arr.2, 0): Constant: 0x64 + call void @analyze(i32 %extract.0) +; CHECK: ( %extract.1 = extractvalue [3 x i32] %arr.2, 1): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(i32 %extract.1) +; CHECK: ( %extract.2 = extractvalue [3 x i32] %arr.2, 2): UndefOrPoison + call void @analyze(i32 %extract.2) + ret void +} + +define void @testVector(i32 %arg) { +; CHECK-LABEL: testVector + %vec.1 = insertelement <3 x i32> poison, i32 100, i32 0 + %vec.2 = insertelement <3 x i32> %vec.1, i32 %arg, i32 1 + %extract.0 = extractelement <3 x i32> %vec.2, i32 0 + %extract.1 = extractelement <3 x i32> %vec.2, i32 1 + %extract.2 = extractelement <3 x i32> %vec.2, i32 2 + %extract.dyn = extractelement <3 x i32> %vec.2, i32 %arg +; CHECK: ( %extract.0 = extractelement <3 x i32> %vec.2, i32 0): Constant: 0x64 + call void @analyze(i32 %extract.0) +; CHECK: ( %extract.1 = extractelement <3 x i32> %vec.2, i32 1): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(i32 %extract.1) +; 
CHECK: ( %extract.2 = extractelement <3 x i32> %vec.2, i32 2): UndefOrPoison + call void @analyze(i32 %extract.2) +; CHECK: ( %extract.dyn = extractelement <3 x i32> %vec.2, i32 %arg): Dynamic: %extract.dyn = extractelement <3 x i32> %vec.2, i32 %arg (offset 0) + call void @analyze(i32 %extract.dyn) + + ; Test that inserting an i1 into the middle of a dword doesn't accidentally overwrite high bits + %insert.i1 = insertelement <32 x i1> zeroinitializer, i1 1, i32 16 +; CHECK: ( %insert.i1 = {{.*}}): Dynamic + call void @analyze(<32 x i1> %insert.i1) + + ret void +} + +define void @testBitcast(i32 %arg) { +; CHECK-LABEL: testBitcast + %bitcast = bitcast i32 %arg to float +; CHECK: ( %bitcast = bitcast i32 %arg to float): Dynamic (argument): i32 %arg (offset 0) + call void @analyze(float %bitcast) + ret void +} + +define void @testSelect(i32 %arg1, i1 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, i1 %arg6, i1 %arg7) { +; CHECK-LABEL: testSelect + %sel.1 = select i1 %arg2, i32 %arg1, i32 -1 +; CHECK: ( %sel.1 = select i1 %arg2, i32 %arg1, i32 -1): (Constant: 0xffffffff | Dynamic (argument): i32 %arg1 (offset 0)) + call void @analyze(i32 %sel.1) + +; Consistent constant, in different order: + %sel.2 = select i1 %arg3, i32 -1, i32 %sel.1 +; CHECK: ( %sel.2 = select i1 %arg3, i32 -1, i32 %sel.1): (Constant: 0xffffffff | Dynamic (argument): i32 %arg1 (offset 0)) + call void @analyze(i32 %sel.2) + +; Inconsistent constants mean we don't know anything: + %sel.3 = select i1 %arg5, i32 %sel.2, i32 0 +; CHECK: ( %sel.3 = select i1 %arg5, i32 %sel.2, i32 0): Dynamic: %sel.3 = select i1 %arg5, i32 %sel.2, i32 0 (offset 0) + call void @analyze(i32 %sel.3) + +; Consistent dynamic value: + %arg1.bc.float = bitcast i32 %arg1 to float + %arg1.bc.i32 = bitcast float %arg1.bc.float to i32 + %sel.4 = select i1 %arg6, i32 %sel.2, i32 %arg1.bc.i32 +; CHECK: ( %sel.4 = select i1 %arg6, i32 %sel.2, i32 %arg1.bc.i32): (Constant: 0xffffffff | Dynamic (argument): i32 %arg1 (offset 0)) + call void 
@analyze(i32 %sel.4) + +; Inconsistent dynamic value means we don't know anything + %sel.5 = select i1 %arg6, i32 %sel.2, i32 %arg4 +; CHECK: ( %sel.5 = select i1 %arg6, i32 %sel.2, i32 %arg4): Dynamic: %sel.5 = select i1 %arg6, i32 %sel.2, i32 %arg4 (offset 0) + call void @analyze(i32 %sel.5) + +; Add in poison: + %sel.6 = select i1 %arg7, i32 %sel.2, i32 poison +; CHECK: ( %sel.6 = select i1 %arg7, i32 %sel.2, i32 poison): (UndefOrPoison | Constant: 0xffffffff | Dynamic (argument): i32 %arg1 (offset 0)) + call void @analyze(i32 %sel.6) + + ret void +} + +define void @testPhi(i32 %arg1, i1 %arg2, i1 %arg3, [5 x i32] %arg4) { +; CHECK-LABEL: testPhi +entry: + %empty = phi i32 +; CHECK: ( %empty = phi i32 ): Dynamic: %empty = phi i32 (offset 0) + call void @analyze(i32 %empty) + + br i1 %arg2, label %bb1, label %bb2 +bb1: + %phi.arg = phi i32 [ %arg1, %entry ] +; CHECK: ( %phi.arg = phi i32 [ %arg1, %entry ]): Dynamic (argument): i32 %arg1 (offset 0) + call void @analyze(i32 %phi.arg) + br label %bb2 +bb2: + %phi.argOrConst = phi i32 [ %arg1, %entry ], [ 1, %bb1] +; CHECK: ( %phi.argOrConst = phi i32 [ %arg1, %entry ], [ 1, %bb1 ]): (Constant: 0x1 | Dynamic (argument): i32 %arg1 (offset 0)) + call void @analyze(i32 %phi.argOrConst) + br label %loop.entry +loop.entry: + %phi.loop.constant = phi i32 [ 1, %bb2 ], [ 1, %loop.entry] + %phi.loop.propagate = phi i32 [ 1, %bb2 ], [ %phi.loop.propagate, %loop.entry] + +; CHECK: ( %phi.loop.constant = phi i32 [ 1, %bb2 ], [ 1, %loop.entry ]): Constant: 0x1 + call void @analyze(i32 %phi.loop.constant) +; %phi.loop.propagate is always constant, but figuring this out requires propagating +; multiple times through the loop until a stable state is reached, which we don't do: +; CHECK: ( %phi.loop.propagate = {{.*}}: Dynamic: %phi.loop.propagate = phi + call void @analyze(i32 %phi.loop.propagate) + br i1 %arg3, label %loop.entry, label %bb.startmany +bb.startmany: + switch i32 %arg1, label %exit [ i32 0, label %bb.many.0 + i32 1, 
label %bb.many.1
+                                  i32 2, label %bb.many.2
+                                  i32 3, label %bb.many.3
+                                  i32 4, label %bb.many.4 ]
+bb.many.0:
+  %arr.0 = insertvalue [5 x i32] %arg4, i32 0, 0
+  br label %bb.many.exit
+bb.many.1:
+  %arr.1 = insertvalue [5 x i32] %arg4, i32 1, 1
+  br label %bb.many.exit
+bb.many.2:
+  %arr.2 = insertvalue [5 x i32] %arg4, i32 2, 2
+  br label %bb.many.exit
+bb.many.3:
+  %arr.3 = insertvalue [5 x i32] %arg4, i32 3, 3
+  br label %bb.many.exit
+bb.many.4:
+  %arr.4 = insertvalue [5 x i32] %arg4, i32 4, 4
+  br label %bb.many.exit
+bb.many.exit:
+  %arr.phi = phi [5 x i32] [ %arr.0, %bb.many.0 ], [ %arr.1, %bb.many.1 ], [ %arr.2, %bb.many.2 ], [ %arr.3, %bb.many.3 ], [ %arr.4, %bb.many.4 ]
+; CHECK: ( %arr.phi = phi {{.*}}): (Constant: 0x0 | Dynamic (argument): [5 x i32] %arg4 (offset 0)); (Constant: 0x1 | Dynamic (argument): [5 x i32] %arg4 (offset 4)); (Constant: 0x2 | Dynamic (argument): [5 x i32] %arg4 (offset 8)); (Constant: 0x3 | Dynamic (argument): [5 x i32] %arg4 (offset 12)); (Constant: 0x4 | Dynamic (argument): [5 x i32] %arg4 (offset 16))
+  call void @analyze([5 x i32] %arr.phi)
+  br label %exit
+exit:
+  ret void
+}
+
+; This is a regression test against an earlier problem with the order in which we analyze
+; values. We need to process operands before processing an instruction itself, i.e. in a topological order.
+; If there are cycles, we need to give up on some dependencies (supposedly only back dependencies to phi nodes).
+define void @testProcessOrder(i32 %arg1, i1 %cond) {
+; CHECK-LABEL: testProcessOrder
+; This fails with DFS order: We push a and b to the stack when checking c.
+; Then we process b, and see that a is already on the stack, so we don't push a to the stack again.
+; After having processed arg1, b is on top of the stack, so we pop it and analyze it, but a is still unknown. 
+  %a = select i1 %cond, i32 %arg1, i32 7
+  %b = select i1 %cond, i32 %a, i32 7
+  %c = select i1 %cond, i32 %a, i32 %b
+; CHECK: ( %c = select {{.*}}): (Constant: 0x7 | Dynamic (argument): i32 %arg1 (offset 0))
+  call void @analyze(i32 %c)
+  ret void
+}
+
+; For unsupported instructions (e.g. add), we try to use computeKnownBits as a last fallback.
+; This allows us to detect some simple cases as well.
+define void @testDynamicComputeKnownBits(i32 %arg1, i1 %cond) {
+; CHECK-LABEL: testDynamicComputeKnownBits
+  %add = add i32 1, 2
+; CHECK: ( %add = add {{.*}}): Constant: 0x3
+  call void @analyze(i32 %add)
+
+; computeKnownBits only supports integers:
+; CHECK: ( %fadd = fadd {{.*}}): Dynamic
+  %fadd = fadd float 1.0, 2.0
+  call void @analyze(float %fadd)
+  ret void
+}
diff --git a/compilerutils/test/value-origin-tracking/max-value-size.ll b/compilerutils/test/value-origin-tracking/max-value-size.ll
new file mode 100644
index 0000000000..9170031314
--- /dev/null
+++ b/compilerutils/test/value-origin-tracking/max-value-size.ll
@@ -0,0 +1,14 @@
+; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-max-bytes-per-value=4 | FileCheck %s --check-prefix=CHECK-SMALL
+; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-max-bytes-per-value=1024 | FileCheck %s --check-prefix=CHECK-HUGE
+
+declare void @analyze(...) 
+ +define void @test() { +; CHECK-LABEL: test + %arr = insertvalue [256 x i32] poison, i32 7, 255 + %val = extractvalue [256 x i32] %arr, 255 +; CHECK-SMALL: ( %val = extractvalue [256 x i32] %arr, 255): Dynamic: %val = extractvalue [256 x i32] %arr, 255 (offset 0) +; CHECK-HUGE: ( %val = extractvalue [256 x i32] %arr, 255): Constant: 0x7 + call void @analyze(i32 %val) + ret void +} diff --git a/compilerutils/test/value-origin-tracking/slice-sizes.ll b/compilerutils/test/value-origin-tracking/slice-sizes.ll new file mode 100644 index 0000000000..147a6ce1bc --- /dev/null +++ b/compilerutils/test/value-origin-tracking/slice-sizes.ll @@ -0,0 +1,73 @@ +; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-bytes-per-slice=1 | FileCheck %s --check-prefix=CHECK1 +; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-bytes-per-slice=4 | FileCheck %s --check-prefix=CHECK4 + +declare void @analyze(...) + +define void @testConstant() { +; CHECK-LABEL: testConstant +; CHECK1: (i32 -5601263): Constant: 0x11; Constant: 0x88; Constant: 0xaa; Constant: 0xff +; CHECK4: (i32 -5601263): Constant: 0xffaa8811 + call void @analyze(i32 u0xffaa8811) + ret void +} + +define void @testArray(i8 %arg) { +; CHECK-LABEL: testArray + %arr.1 = insertvalue [3 x i8] poison, i8 u0xff, 0 + %arr.2 = insertvalue [3 x i8] %arr.1, i8 poison, 1 + %arr.3 = insertvalue [3 x i8] %arr.2, i8 %arg, 2 +; CHECK1: ( %arr.3 = {{.*}}: Constant: 0xff; UndefOrPoison; Dynamic (argument): i8 %arg (offset 0) +; CHECK4: ( %arr.3 = {{.*}}: Dynamic: {{.*}} (offset 0) + call void @analyze([3 x i8] %arr.3) + ret void +} + +; Check that inserting a value into a range that is not slice-aligned invalidates +; the affected slices, but preserves the other ones. +; We insert the i16 at index 3 into this packed struct, which covers bytes 7 and 8. +; This touches two dwords, so with dword-sized slices the two middle dwords are dynamic. 
+; Byte-sized slices however nicely deal with it. +%packed.struct = type <{i32, i16, i8, i16, i16, i8, i32 }> +; Indices: 0 1 2 3 4 5 6 +; Byte ranges: 0..3 4..5 6..6 7..8 9..10 11..11 12..15 +; interesting value: ----^^^^ +define void @testMisalignedInsertExtract() { +; CHECK-LABEL: testMisalignedInsertExtract +; CHECK1: ( %inserted.3 = {{.*}}): Constant: 0xff; Constant: 0xff; Constant: 0xff; Constant: 0xff; Constant: 0x0; Constant: 0x0; Constant: 0x1; Constant: 0xff; Constant: 0xff +; CHECK1-SAME: Constant: 0x0; Constant: 0x0; Constant: 0x0; Constant: 0x0; Constant: 0x0; Constant: 0x0; Constant: 0x0 +; CHECK4: ( %inserted.3 = {{.*}}): Constant: 0xffffffff; Dynamic: {{.*}}; Dynamic: {{.*}}; Constant: 0x0 + %inserted.1 = insertvalue %packed.struct zeroinitializer, i32 -1, 0 + %inserted.2 = insertvalue %packed.struct %inserted.1, i8 1, 2 + %inserted.3 = insertvalue %packed.struct %inserted.2, i16 -1, 3 + call void @analyze(%packed.struct %inserted.3) + +; CHECK1: ( %extracted = {{.*}}): Constant: 0x0 +; CHECK4: ( %extracted = {{.*}}): Dynamic + %extracted = extractvalue %packed.struct zeroinitializer, 3 + call void @analyze(i16 %extracted) + + ret void +} + +; Test that inserting/extracting a value that is slice-aligned but smaller than a slice works correctly +; We insert/extract the i16 at index 1 in this struct: +%packed.struct.1 = type <{i32, i16, i16, i32 }> +; interesting value: ----^^^^ +define void @testAlignedSubSliceInsertExtract() { +; CHECK-LABEL: testAlignedSubSliceInsertExtract + %inserted.1 = insertvalue %packed.struct.1 zeroinitializer, i32 -1, 0 + %extracted.1 = extractvalue %packed.struct.1 %inserted.1, 1 + %inserted.2 = insertvalue %packed.struct.1 %inserted.1, i16 1, 1 + %extracted.2 = extractvalue %packed.struct.1 %inserted.2, 1 + +; CHECK1: ( %extracted.1 = {{.*}}): Constant: 0x0; Constant: 0x0 +; CHECK4: ( %extracted.1 = {{.*}}): Constant: 0x0 + call void @analyze(i16 %extracted.1) + +; CHECK1: ( %extracted.2 = {{.*}}): Constant: 0x1; 
Constant: 0x0 +; We don't support partial insertions, so treat this conservatively: +; CHECK4: ( %extracted.2 = {{.*}}): Dynamic + call void @analyze(i16 %extracted.2) + + ret void +} diff --git a/compilerutils/test/value-origin-tracking/vector.ll b/compilerutils/test/value-origin-tracking/vector.ll new file mode 100644 index 0000000000..b7aa3bb5cc --- /dev/null +++ b/compilerutils/test/value-origin-tracking/vector.ll @@ -0,0 +1,68 @@ +; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-bytes-per-slice=1 | FileCheck %s --check-prefix=CHECK1 +; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-bytes-per-slice=4 | FileCheck %s --check-prefix=CHECK4 +; +; Test vector ops on types that aren't byte-aligned (i1) and overaligned (i16) +target datalayout = "i16:32" + +declare void @analyze(...) + +define void @testi1(i32 %arg) { +; CHECK-LABEL: testi1 + %vec.0 = insertelement <16 x i1> poison, i1 1, i32 0 + %vec.1 = insertelement <16 x i1> %vec.0, i1 0, i32 1 + %vec.2 = insertelement <16 x i1> %vec.1, i1 1, i32 2 + %vec.3 = insertelement <16 x i1> %vec.2, i1 0, i32 3 + %vec.4 = insertelement <16 x i1> %vec.3, i1 1, i32 4 + %vec.5 = insertelement <16 x i1> %vec.4, i1 0, i32 5 + %vec.6 = insertelement <16 x i1> %vec.5, i1 1, i32 6 + %vec.7 = insertelement <16 x i1> %vec.6, i1 0, i32 7 + %vec.8 = insertelement <16 x i1> %vec.7, i1 1, i32 8 + %vec.9 = insertelement <16 x i1> %vec.8, i1 0, i32 9 + %vec.10 = insertelement <16 x i1> %vec.9, i1 1, i32 10 + call void @analyze(<16 x i1> %vec.10) +; CHECK1: ( %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10): Dynamic: %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10 (offset 0); Dynamic: %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10 (offset 1) +; CHECK4: ( %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10): Dynamic: %vec.10 = insertelement <16 x i1> %vec.9, i1 true, i32 10 (offset 0) + ret void +} + +define void @testi1InsertExtract() { 
+; CHECK-LABEL: testi1InsertExtract
+; We don't support sub-byte inserts/extractions yet, as demonstrated in this test
+  %vec.1 = bitcast i32 -1 to <32 x i1>
+  %extract.1 = extractelement <32 x i1> %vec.1, i32 0
+; CHECK: ( %extract.1 = extractelement <32 x i1> %vec.1, i32 0): Dynamic: {{.*}} (offset 0)
+  call void @analyze(i1 %extract.1)
+  %vec.2 = bitcast i32 0 to <32 x i1>
+  %vec.3 = insertelement <32 x i1> %vec.2, i1 1, i32 8
+  call void @analyze(<32 x i1> %vec.3)
+; CHECK: ( %vec.3 = insertelement <32 x i1> %vec.2, i1 true, i32 8): Dynamic: {{.*}} (offset 0); Dynamic: {{.*}} (offset 1); Dynamic: {{.*}} (offset 2); Dynamic: {{.*}} (offset 3)
+  ret void
+}
+
+define void @testi16(i32 %arg) {
+; CHECK-LABEL: testi16
+  %vec.1 = insertelement <4 x i16> poison, i16 -1, i32 0
+  %vec.2 = insertelement <4 x i16> %vec.1, i16 0, i32 1
+; CHECK1: %vec.2 = {{.*}}: Constant: 0xff; Constant: 0xff; Constant: 0x0; Constant: 0x0; UndefOrPoison; UndefOrPoison; UndefOrPoison; UndefOrPoison
+; Sub-slice extract/insert isn't supported:
+; CHECK4: %vec.2 = {{.*}}: Dynamic
+  call void @analyze(<4 x i16> %vec.2)
+; CHECK1: %extract.1 = {{.*}}): Constant: 0xff; Constant: 0xff
+; CHECK4: %extract.1 = {{.*}}: Dynamic
+  %extract.1 = extractelement <4 x i16> %vec.2, i32 0
+  call void @analyze(i16 %extract.1)
+; CHECK1: %extract.2 = {{.*}}): Constant: 0x0; Constant: 0x0
+; CHECK4: %extract.2 = {{.*}}: Dynamic
+  %extract.2 = extractelement <4 x i16> %vec.2, i32 1
+  call void @analyze(i16 %extract.2)
+  ret void
+}
+
+; Regression test for computeKnownBits handling of vectors
+define void @testShuffleVector(i32 %arg) {
+; CHECK-LABEL: testShuffleVector
+  %vec = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> 
+; CHECK: %vec = shufflevector {{.*}}: Dynamic
+  call void @analyze(<2 x i32> %vec)
+  ret void
+}
diff --git a/compilerutils/test/value-specialization/specialization.ll b/compilerutils/test/value-specialization/specialization.ll
new file mode 100644
index 
0000000000..87ca44f057 --- /dev/null +++ b/compilerutils/test/value-specialization/specialization.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; +; RUN: opt --verify-each -passes='value-specialization-test' -S %s | FileCheck %s +; +; Intentionally align i64 to 64 bits so we can test specializations within types with padding, +; and align float to 16 bits to test misaligned dword-sized scalars. +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:16-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +; Syntax: +; call void @specialize(i32 %flags, %val, i32 %dw0Kind, i32 %dw0Constant, [...]) +; flag bits: +; skip value tracking check: 0x1 +; allow replacement failures: 0x2 (if not set, fail if any dword replacement on this value fails) +; Kind values: +; None: 0 +; Constant: 1 +; FrozenPoison: 2 +declare void @specialize(...) +declare void @use(...) + +define void @SimpleScalars(i32 %arg0, i32 %arg1, i32 %arg2, float %arg3) { +; CHECK-LABEL: define void @SimpleScalars( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], float [[ARG3:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 poison +; CHECK-NEXT: call void (...) @use(i32 [[ARG0]], i32 42, i32 [[TMP1]], float 0x3744E40000000000) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, i32 %arg0, i32 0, i32 poison) + call void @specialize(i32 0, i32 %arg1, i32 1, i32 42) + call void @specialize(i32 0, i32 %arg2, i32 2, i32 poison) + call void @specialize(i32 0, float %arg3, i32 1, i32 1337) + call void (...) @use(i32 %arg0, i32 %arg1, i32 %arg2, float %arg3) + ret void +} + +; I64 specialization is "special", as we potentially specialize low and high dwords separately. 
+; Test all non-trivial combinations: +; (low dword) (high dword) +; * arg0: None + Constant +; * arg1: None + FrozenPoison +; * arg2: Constant + None +; * arg3: Constant + FrozenPoison +; * arg4: FrozenPoison + None +; * arg5: FrozenPoison + Constant +; as well as uniform ones: +; * arg6: Constant + Constant +; * arg7: FrozenPoison + FrozenPoison +; +; Don't check with value tracking (flags=1) as it does not support the used bitwise operations. +define void @I64s(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7) { +; CHECK-LABEL: define void @I64s( +; CHECK-SAME: i64 [[ARG0:%.*]], i64 [[ARG1:%.*]], i64 [[ARG2:%.*]], i64 [[ARG3:%.*]], i64 [[ARG4:%.*]], i64 [[ARG5:%.*]], i64 [[ARG6:%.*]], i64 [[ARG7:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[ARG0]], 4294967295 +; CHECK-NEXT: [[ARG0_SPECIALIZED:%.*]] = or i64 [[TMP1]], 4294967296 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[ARG1]], 4294967295 +; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 poison +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -4294967296 +; CHECK-NEXT: [[ARG1_SPECIALIZED:%.*]] = or i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[ARG2]], -4294967296 +; CHECK-NEXT: [[ARG2_SPECIALIZED:%.*]] = or i64 2, [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = freeze i64 poison +; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], -4294967296 +; CHECK-NEXT: [[ARG3_SPECIALIZED:%.*]] = or i64 3, [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = freeze i64 poison +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 4294967295 +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[ARG4]], -4294967296 +; CHECK-NEXT: [[ARG4_SPECIALIZED:%.*]] = or i64 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = freeze i64 poison +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], 4294967295 +; CHECK-NEXT: [[ARG5_SPECIALIZED:%.*]] = or i64 [[TMP12]], 17179869184 +; CHECK-NEXT: [[TMP13:%.*]] = freeze i64 poison +; CHECK-NEXT: call void (...) 
@use(i64 [[ARG0_SPECIALIZED]], i64 [[ARG1_SPECIALIZED]], i64 [[ARG2_SPECIALIZED]], i64 [[ARG3_SPECIALIZED]], i64 [[ARG4_SPECIALIZED]], i64 [[ARG5_SPECIALIZED]], i64 25769803781, i64 [[TMP13]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 1, i64 %arg0, i32 0, i32 poison, i32 1, i32 1) + call void @specialize(i32 1, i64 %arg1, i32 0, i32 poison, i32 2, i32 poison) + call void @specialize(i32 1, i64 %arg2, i32 1, i32 2, i32 0, i32 poison) + call void @specialize(i32 1, i64 %arg3, i32 1, i32 3, i32 2, i32 poison) + call void @specialize(i32 1, i64 %arg4, i32 2, i32 poison, i32 0, i32 poison) + call void @specialize(i32 1, i64 %arg5, i32 2, i32 poison, i32 1, i32 4) + call void @specialize(i32 1, i64 %arg6, i32 1, i32 5, i32 1, i32 6) + call void @specialize(i32 1, i64 %arg7, i32 2, i32 poison, i32 2, i32 poison) + call void (...) @use(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7) + ret void +} + +define void @Double(double %arg) { +; CHECK-LABEL: define void @Double( +; CHECK-SAME: double [[ARG:%.*]]) { +; CHECK-NEXT: call void (...) @use(double 2.075080e-322) +; CHECK-NEXT: ret void +; + call void @specialize(i32 1, double %arg, i32 1, i32 42, i32 1, i32 0) + call void (...) @use(double %arg) + ret void +} + +; ptr is 64 bits wide, ptr addrspace (20) is 32 bits wide +define void @Pointers(ptr %arg0, ptr addrspace(20) %arg1) { +; CHECK-LABEL: define void @Pointers( +; CHECK-SAME: ptr [[ARG0:%.*]], ptr addrspace(20) [[ARG1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARG0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296 +; CHECK-NEXT: [[TMP3:%.*]] = or i64 42, [[TMP2]] +; CHECK-NEXT: [[ARG0_SPECIALIZED:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: call void (...) 
@use(ptr [[ARG0_SPECIALIZED]], ptr addrspace(20) inttoptr (i32 43 to ptr addrspace(20))) +; CHECK-NEXT: ret void +; + call void @specialize(i32 1, ptr %arg0, i32 1, i32 42, i32 0, i32 poison) + call void @specialize(i32 1, ptr addrspace(20) %arg1, i32 1, i32 43) + call void (...) @use(ptr %arg0, ptr addrspace(20) %arg1) + ret void +} + +define void @Array([3 x i32] %args) { +; CHECK-LABEL: define void @Array( +; CHECK-SAME: [3 x i32] [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue [3 x i32] [[ARGS]], i32 42, 1 +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 poison +; CHECK-NEXT: [[ARGS_SPECIALIZED1:%.*]] = insertvalue [3 x i32] [[ARGS_SPECIALIZED]], i32 [[TMP1]], 2 +; CHECK-NEXT: call void (...) @use([3 x i32] [[ARGS_SPECIALIZED1]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, [3 x i32] %args, i32 0, i32 poison, i32 1, i32 42, i32 2, i32 poison) + call void (...) @use([3 x i32] %args) + ret void +} + +define void @Struct({ i32, i32, i32 } %args) { +; CHECK-LABEL: define void @Struct( +; CHECK-SAME: { i32, i32, i32 } [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, i32, i32 } [[ARGS]], i32 42, 1 +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 poison +; CHECK-NEXT: [[ARGS_SPECIALIZED1:%.*]] = insertvalue { i32, i32, i32 } [[ARGS_SPECIALIZED]], i32 [[TMP1]], 2 +; CHECK-NEXT: call void (...) @use({ i32, i32, i32 } [[ARGS_SPECIALIZED1]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, i32, i32 } %args, i32 0, i32 poison, i32 1, i32 42, i32 2, i32 poison) + call void (...) @use({ i32, i32, i32 } %args) + ret void +} + +define void @Vector(<3 x i32> %args) { +; CHECK-LABEL: define void @Vector( +; CHECK-SAME: <3 x i32> [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertelement <3 x i32> [[ARGS]], i32 42, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 poison +; CHECK-NEXT: [[ARGS_SPECIALIZED1:%.*]] = insertelement <3 x i32> [[ARGS_SPECIALIZED]], i32 [[TMP1]], i64 2 +; CHECK-NEXT: call void (...) 
@use(<3 x i32> [[ARGS_SPECIALIZED1]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, <3 x i32> %args, i32 0, i32 poison, i32 1, i32 42, i32 2, i32 poison) + call void (...) @use(<3 x i32> %args) + ret void +} + +; Test that when replacing some but not all dwords of a nested struct, we directly insertvalue into the outer struct +define void @NestedStructPartialReplace({ i32, { i32, i32 } } %args) { +; CHECK-LABEL: define void @NestedStructPartialReplace( +; CHECK-SAME: { i32, { i32, i32 } } [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, { i32, i32 } } [[ARGS]], i32 42, 1, 0 +; CHECK-NEXT: call void (...) @use({ i32, { i32, i32 } } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, { i32, i32 } } %args, i32 0, i32 poison, i32 1, i32 42, i32 0, i32 poison) + call void (...) @use({ i32, { i32, i32 } } %args) + ret void +} + +; Test that when replacing some but not all dwords of a nested vector, we first extract the old vector, +; insert replacements, and then insert the replaced vector +define void @NestedVectorWithPartialReplace({ i32, <2 x i32>} %args) { +; CHECK-LABEL: define void @NestedVectorWithPartialReplace( +; CHECK-SAME: { i32, <2 x i32> } [[ARGS:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, <2 x i32> } [[ARGS]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 42, i64 0 +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, <2 x i32> } [[ARGS]], <2 x i32> [[TMP2]], 1 +; CHECK-NEXT: call void (...) @use({ i32, <2 x i32> } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, <2 x i32>} %args, i32 0, i32 poison, i32 1, i32 42, i32 0, i32 poison) + call void (...) 
@use({ i32, <2 x i32>} %args) + ret void +} + +; Test that when replacing multiple but not all dwords of a nested vector, we first extract the old vector, +; insert all replacements, and then insert the replaced vector just once +define void @NestedVectorWithPartialMultiReplace({ i32, <3 x i32>} %args) { +; CHECK-LABEL: define void @NestedVectorWithPartialMultiReplace( +; CHECK-SAME: { i32, <3 x i32> } [[ARGS:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, <3 x i32> } [[ARGS]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i32> [[TMP1]], i32 42, i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 43, i64 1 +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, <3 x i32> } [[ARGS]], <3 x i32> [[TMP3]], 1 +; CHECK-NEXT: call void (...) @use({ i32, <3 x i32> } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, <3 x i32>} %args, i32 0, i32 poison, i32 1, i32 42, i32 1, i32 43, i32 0, i32 poison) + call void (...) @use({ i32, <3 x i32>} %args) + ret void +} + +; Test that when replacing all dwords of a nested vector, we inserted the replacement values +; into a new frozen poison vector, and then insertvalue that into the struct. +define void @NestedVectorWithFullReplace({ i32, <2 x i32>} %args) { +; CHECK-LABEL: define void @NestedVectorWithFullReplace( +; CHECK-SAME: { i32, <2 x i32> } [[ARGS:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = freeze <2 x i32> poison +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 42, i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 43, i64 1 +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, <2 x i32> } [[ARGS]], <2 x i32> [[TMP3]], 1 +; CHECK-NEXT: call void (...) @use({ i32, <2 x i32> } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, <2 x i32>} %args, i32 0, i32 poison, i32 1, i32 42, i32 1, i32 43) + call void (...) 
@use({ i32, <2 x i32>} %args) + ret void +} + +; There is a padding dword before the nested struct, because i64 is 64-bit aligned. +; Check that replacing dword index 4 correctly replaces the nested i32. +define void @NestedStructWithPadding({ i32, { i64, i32 } } %args) { +; CHECK-LABEL: define void @NestedStructWithPadding( +; CHECK-SAME: { i32, { i64, i32 } } [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, { i64, i32 } } [[ARGS]], i32 42, 1, 1 +; CHECK-NEXT: call void (...) @use({ i32, { i64, i32 } } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, { i64, i32 } } %args, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 1, i32 42, i32 0, i32 poison) + call void (...) @use({ i32 , { i64, i32 } } %args) + ret void +} + +define void @NestedAll({ i32, [ 2 x { i32, <2 x i32> } ] } %args) { +; CHECK-LABEL: define void @NestedAll( +; CHECK-SAME: { i32, [2 x { i32, <2 x i32> }] } [[ARGS:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, [2 x { i32, <2 x i32> }] } [[ARGS]], 1, 1, 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 42, i64 1 +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, [2 x { i32, <2 x i32> }] } [[ARGS]], <2 x i32> [[TMP2]], 1, 1, 1 +; CHECK-NEXT: call void (...) @use({ i32, [2 x { i32, <2 x i32> }] } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 0, { i32, [ 2 x { i32, <2 x i32> } ] } %args, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 1, i32 42) + call void (...) @use({ i32, [ 2 x { i32, <2 x i32> } ] } %args) + ret void +} + +define void @FailSmallTypes(i1 %arg0, i8 %arg1, i16 %arg2, half %arg3) { +; CHECK-LABEL: define void @FailSmallTypes( +; CHECK-SAME: i1 [[ARG0:%.*]], i8 [[ARG1:%.*]], i16 [[ARG2:%.*]], half [[ARG3:%.*]]) { +; CHECK-NEXT: call void (...) 
@use(i1 [[ARG0]], i8 [[ARG1]], i16 [[ARG2]], half [[ARG3]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 3, i1 %arg0, i32 1, i32 1) + call void @specialize(i32 3, i8 %arg1, i32 1, i32 1) + call void @specialize(i32 3, i16 %arg2, i32 1, i32 1) + call void @specialize(i32 3, half %arg3, i32 1, i32 1) + call void (...) @use(i1 %arg0, i8 %arg1, i16 %arg2, half %arg3) + ret void +} + +; These are not supported yet, but we could add support later. It would require splitting constant values though. +define void @FailSmallTypesInAggregates(<2 x i16> %arg0, [2 x i16] %arg1) { +; CHECK-LABEL: define void @FailSmallTypesInAggregates( +; CHECK-SAME: <2 x i16> [[ARG0:%.*]], [2 x i16] [[ARG1:%.*]]) { +; CHECK-NEXT: call void (...) @use(<2 x i16> [[ARG0]], [2 x i16] [[ARG1]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 3, <2 x i16> %arg0, i32 1, i32 1) + call void @specialize(i32 3, [2 x i16] %arg1, i32 1, i32 1) + call void (...) @use(<2 x i16> %arg0, [2 x i16] %arg1) + ret void +} + +; Test that replacing into the storage of a misaligned dword-sized scalar fails +; Replacing the first float succeeds, because it is dword-aligned, the second replacement should fail. +define void @FailMisalignedDwordScalar({ float, i16, float, float, i16 } %args) { +; CHECK-LABEL: define void @FailMisalignedDwordScalar( +; CHECK-SAME: { float, i16, float, float, i16 } [[ARGS:%.*]]) { +; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { float, i16, float, float, i16 } [[ARGS]], float 0x36F5000000000000, 0 +; CHECK-NEXT: call void (...) @use({ float, i16, float, float, i16 } [[ARGS_SPECIALIZED]]) +; CHECK-NEXT: ret void +; + call void @specialize(i32 3, { float, i16, float, float, i16 } %args, i32 1, i32 42, i32 0, i32 poison, i32 1, i32 43, i32 0, i32 poison) + call void (...) @use({ float, i16, float, float, i16 } %args) + ret void +} + +; Specialize a value in control flow, testing that we insert instructions at the correct place. 
+define void @ControlFlow([2 x i32] %arg0, i1 %arg1, i1 %arg2) { +; CHECK-LABEL: define void @ControlFlow( +; CHECK-SAME: [2 x i32] [[ARG0:%.*]], i1 [[ARG1:%.*]], i1 [[ARG2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[ARG1]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[LOOPDEP:%.*]] = phi [2 x i32] [ [[ARG0]], [[ENTRY:%.*]] ], [ [[INSERTED_SPECIALIZED:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[EXTRACT:%.*]] = extractvalue [2 x i32] [[LOOPDEP]], 0 +; CHECK-NEXT: [[INCR:%.*]] = add i32 [[EXTRACT]], 1 +; CHECK-NEXT: [[INSERTED:%.*]] = insertvalue [2 x i32] [[LOOPDEP]], i32 [[INCR]], 0 +; CHECK-NEXT: [[INSERTED_SPECIALIZED]] = insertvalue [2 x i32] [[INSERTED]], i32 42, 1 +; CHECK-NEXT: call void (...) @use([2 x i32] [[INSERTED_SPECIALIZED]]) +; CHECK-NEXT: br i1 [[ARG2]], label [[LOOP]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br i1 %arg1, label %loop, label %exit +loop: + %loopdep = phi [2 x i32] [ %arg0, %entry ], [ %inserted, %loop ] + %extract = extractvalue [2 x i32] %loopdep, 0 + %incr = add i32 %extract, 1 + %inserted = insertvalue [2 x i32] %loopdep, i32 %incr, 0 + call void @specialize(i32 0, [2 x i32] %inserted, i32 0, i32 poison, i32 1, i32 42) + call void (...) @use([2 x i32] %inserted) + br i1 %arg2, label %loop, label %exit +exit: + ret void +} diff --git a/docs/DdnDebugPrintf.md b/docs/DdnDebugPrintf.md index 9e4036b28e..5fea0b6feb 100644 --- a/docs/DdnDebugPrintf.md +++ b/docs/DdnDebugPrintf.md @@ -206,7 +206,7 @@ Example: #### lgc::LowerDebugPrintf -The module pass `LowerDebugPrintf` runs just before `PatchEntryPointMutate`. +The module pass `LowerDebugPrintf` runs just before `MutateEntryPoint`. 
It collects all calls to `@lgc.debug.printf` in the entire module and: * Collects the format strings and adds the `amdpal.format_strings` entry to the diff --git a/imported/llvm-dialects b/imported/llvm-dialects index 6ff7d39046..bdfb113d8d 160000 --- a/imported/llvm-dialects +++ b/imported/llvm-dialects @@ -1 +1 @@ -Subproject commit 6ff7d39046e280e446fd69aa08c6c6524c68c728 +Subproject commit bdfb113d8d765bdf4554a2b30ae909b93f26aeea diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h index d1bfd86971..e22e24e337 100644 --- a/include/vkgcDefs.h +++ b/include/vkgcDefs.h @@ -435,6 +435,27 @@ enum class ResourceLayoutScheme : unsigned { /// 3. descriptor set index for each set }; +/// Specifies compile-time values for a single slot in a constant buffer. +struct CompileTimeConst { + uint32_t offset; ///< Which constant buffer slot (i.e., vec4) contains these values. + uint32_t set; ///< Uniform set. + uint32_t binding; ///< Uniform binding. + uint32_t validBytes; ///< Mask of which bytes in the values array are valid (are provided by the caller). This + ///< is not a mask but the actual valid bytes count from first bit of 'values' as LLPC + ///< could provide enough type info.. + union { + uint32_t u32[4]; ///< The compile-time values as an array of 32-bit values (x, y, z, w). + uint16_t u16[8]; ///< The compile-time values as an array of 16-bit values. + uint8_t u8[16]; ///< The compile-time values as an array of 8-bit values. + } values; ///< The compile-time values for this slot. +}; + +/// Represents info of compile-time constants within a shader of a specified stage. +struct CompileConstInfo { + unsigned numCompileTimeConstants; ///< Number of compile time constants. + CompileTimeConst *pCompileTimeConstants; ///< Actual compile time constants data, for uniform value replacement. +}; + /// Represents per pipeline options. 
struct PipelineOptions { bool includeDisassembly; ///< If set, the disassembly for all compiled shaders will be included in @@ -515,6 +536,7 @@ struct PipelineOptions { bool enablePrimGeneratedQuery; ///< If set, primitive generated query is enabled bool disablePerCompFetch; ///< Disable per component fetch in uber fetch shader. bool reserved21; + CompileConstInfo *compileConstInfo; ///< Compile time constant data. }; /// Prototype of allocator for output data buffer, used in shader-specific operations. @@ -903,6 +925,9 @@ struct PipelineShaderOptions { /// Let dmask bits be fully enabled when call 'image.sample.c', for depth compare mode swizzling workaround. bool imageSampleDrefReturnsRgba; + + /// Application workaround: disable all fast math flags on gl_Position. + bool disableGlPositionOpt; }; /// Represents YCbCr sampler meta data in resource descriptor @@ -1315,6 +1340,7 @@ struct GraphicsPipelineBuildInfo { NggState nggState; ///< NGG state used for tuning and debugging PipelineOptions options; ///< Per pipeline tuning/debugging options bool unlinked; ///< True to build an "unlinked" half-pipeline ELF + bool enableInitUndefZero; ///< True to initialize undefined variable bool dynamicVertexStride; ///< Dynamic Vertex input Stride is enabled. 
bool enableUberFetchShader; ///< Use uber fetch shader bool enableColorExportShader; ///< Explicitly build color export shader, UnlinkedStageFragment elf will diff --git a/include/vkgcGpurtShim.h b/include/vkgcGpurtShim.h index 2246b36702..64521f5af2 100644 --- a/include/vkgcGpurtShim.h +++ b/include/vkgcGpurtShim.h @@ -38,7 +38,7 @@ namespace Vkgc { namespace gpurt { #ifdef HAVE_GPURT_SHIM -void getShaderLibrarySpirv(unsigned featureFlags, const void *&code, size_t &size); +void getShaderLibrarySpirv(Vkgc::RtIpVersion rtIpVersion, unsigned featureFlags, const void *&code, size_t &size); void getFuncTable(Vkgc::RtIpVersion rtIpVersion, Vkgc::GpurtFuncTable &table); Vkgc::RtIpVersion getRtIpVersion(Vkgc::GfxIpVersion gfxIpVersion); #endif diff --git a/lgc/CMakeLists.txt b/lgc/CMakeLists.txt index cf3417aaae..0d0ba569b4 100644 --- a/lgc/CMakeLists.txt +++ b/lgc/CMakeLists.txt @@ -149,16 +149,16 @@ target_sources(LLVMlgc PRIVATE patch/NggPrimShader.cpp patch/Patch.cpp patch/PatchBufferOp.cpp - patch/PatchCheckShaderCache.cpp - patch/PatchCopyShader.cpp - patch/PatchEntryPointMutate.cpp - patch/PatchImageDerivatives.cpp - patch/PatchInOutImportExport.cpp - patch/PatchInvariantLoads.cpp - patch/PatchLlvmIrInclusion.cpp - patch/PatchLoadScalarizer.cpp - patch/PatchMulDx9Zero.cpp - patch/PatchLoopMetadata.cpp + patch/CheckShaderCache.cpp + patch/GenerateCopyShader.cpp + patch/MutateEntryPoint.cpp + patch/LowerImageDerivatives.cpp + patch/LowerInOut.cpp + patch/LowerInvariantLoads.cpp + patch/IncludeLlvmIr.cpp + patch/ScalarizeLoads.cpp + patch/LowerMulDx9Zero.cpp + patch/AddLoopMetadata.cpp patch/PatchNullFragShader.cpp patch/PatchPeepholeOpt.cpp patch/PatchPreparePipelineAbi.cpp @@ -172,7 +172,7 @@ target_sources(LLVMlgc PRIVATE patch/ShaderMerger.cpp patch/SystemValues.cpp patch/VertexFetch.cpp - patch/PatchImageOpCollect.cpp + patch/CollectImageOperations.cpp patch/RegisterMetadataBuilder.cpp #if VKI_BUILD_STRIX1 patch/WorkaroundDsSubdwordWrite.cpp diff --git 
a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp index 088f655ed2..3023c51823 100644 --- a/lgc/builder/BuilderImpl.cpp +++ b/lgc/builder/BuilderImpl.cpp @@ -76,21 +76,18 @@ Type *BuilderBase::getConditionallyVectorizedTy(Type *elementTy, Type *maybeVecT Value *BuilderImpl::CreateDotProduct(Value *const vector1, Value *const vector2, const Twine &instName) { if (vector1->getType()->getScalarType()->isBFloatTy()) { assert(getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 11); - // amdgcn_fdot2_bf16_bf16 will be used. + // Note: v_dot2_bf16_bf16 only respects RTE mode according to HW spec. We must check the specified rounding mode + // before using it. Also, v_dot2_bf16_bf16 doesn't respect signed zeros so we must check NSZ as well. const auto fp16RoundMode = getPipelineState()->getShaderModes()->getCommonShaderMode(m_shaderStage.value()).fp16RoundMode; const auto vectorTy = dyn_cast(vector1->getType()); - if (vectorTy && (fp16RoundMode == FpRoundMode::DontCare || fp16RoundMode == FpRoundMode::Even)) { + if (vectorTy && (fp16RoundMode == FpRoundMode::DontCare || fp16RoundMode == FpRoundMode::Even) && + getFastMathFlags().noSignedZeros()) { int compCount = vectorTy->getNumElements(); Value *result = nullptr; if (compCount % 2 == 0) { - // If all products are of the form +x * -0.0, then the result should be -0.0. This requires a -0.0 - // initial value. - // - // However, we prefer +0.0 as initial value when signed zeros are disabled because it can be encoded as an - // inline constant. - result = ConstantFP::get(getBFloatTy(), getFastMathFlags().noSignedZeros() ? +0.0 : -0.0); + result = ConstantFP::get(getBFloatTy(), 0.0); } else { // If the component count is odd, prefer feeding the last product (odd one out) as initial value. 
Value *lhs = CreateExtractElement(vector1, compCount - 1); diff --git a/lgc/builder/BuilderRecorder.cpp b/lgc/builder/BuilderRecorder.cpp index 280838a6f3..143a7b5a6c 100644 --- a/lgc/builder/BuilderRecorder.cpp +++ b/lgc/builder/BuilderRecorder.cpp @@ -178,6 +178,8 @@ StringRef BuilderRecorder::getCallName(BuilderOpcode opcode) { return "get.desc.ptr"; case BuilderOpcode::LoadPushConstantsPtr: return "load.push.constants.ptr"; + case BuilderOpcode::SamplerFeedbackDesc: + return "sampler.feedback.desc"; case BuilderOpcode::ReadGenericInput: return "read.generic.input"; case BuilderOpcode::ReadPerVertexInput: @@ -1084,6 +1086,16 @@ Value *Builder::CreateLoadPushConstantsPtr(const Twine &instName) { return record(BuilderOpcode::LoadPushConstantsPtr, getPtrTy(ADDR_SPACE_CONST), {}, instName); } +// ===================================================================================================================== +// Merges a resource descriptor into a feedback descriptor to create a descriptor for sampler feedback instructions. +// +// @param feedbackDesc : feedback descriptor +// @param resourceDesc : resource descriptor +Value *Builder::CreateSamplerFeedbackDesc(Value *feedbackDesc, Value *resourceDesc, const Twine &instName) { + return record(BuilderOpcode::SamplerFeedbackDesc, getDescTy(ResourceNodeType::DescriptorResource), + {feedbackDesc, resourceDesc}, instName); +} + // ===================================================================================================================== // Create an image load. 
// @@ -2043,6 +2055,7 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRefsetDoesNotAccessMemory(); break; diff --git a/lgc/builder/BuilderRecorder.h b/lgc/builder/BuilderRecorder.h index 8a9fd60521..cecb86a981 100644 --- a/lgc/builder/BuilderRecorder.h +++ b/lgc/builder/BuilderRecorder.h @@ -109,6 +109,7 @@ enum BuilderOpcode : unsigned { GetDescStride, GetDescPtr, LoadPushConstantsPtr, + SamplerFeedbackDesc, // Image ImageLoad, diff --git a/lgc/builder/BuilderReplayer.cpp b/lgc/builder/BuilderReplayer.cpp index 608b7014dc..b3e34c76bb 100644 --- a/lgc/builder/BuilderReplayer.cpp +++ b/lgc/builder/BuilderReplayer.cpp @@ -408,6 +408,10 @@ Value *BuilderReplayer::processCall(unsigned opcode, CallInst *call) { return m_builder->CreateLoadPushConstantsPtr(); } + case BuilderOpcode::SamplerFeedbackDesc: { + return m_builder->CreateSamplerFeedbackDesc(args[0], args[1]); + } + // Replayer implementations of ImageBuilder methods case BuilderOpcode::ImageLoad: { unsigned dim = cast(args[0])->getZExtValue(); diff --git a/lgc/builder/DescBuilder.cpp b/lgc/builder/DescBuilder.cpp index bf716ec257..0b1374d0af 100644 --- a/lgc/builder/DescBuilder.cpp +++ b/lgc/builder/DescBuilder.cpp @@ -254,7 +254,7 @@ Value *BuilderImpl::CreateGetDescPtr(ResourceNodeType concreteType, ResourceNode // or phi node, we rely on subsequent LLVM optimizations promoting the value back to a constant. 
StringRef startGlobalName = lgcName::ImmutableSamplerGlobal; std::string globalName = - (startGlobalName + Twine(node->set) + "_" + Twine(node->binding) + "_" + Twine(node->visibility)).str(); + (startGlobalName + Twine(node->set) + "_" + Twine(node->binding) + "_" + Twine(node->visibility.toRaw())).str(); Module *module = GetInsertPoint()->getModule(); descPtr = module->getGlobalVariable(globalName, /*AllowInternal=*/true); if (!descPtr) { diff --git a/lgc/builder/ImageBuilder.cpp b/lgc/builder/ImageBuilder.cpp index e24421b9f0..7e7aa56afb 100644 --- a/lgc/builder/ImageBuilder.cpp +++ b/lgc/builder/ImageBuilder.cpp @@ -650,7 +650,7 @@ Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags // Rectangle image uses the same Intrinsic ID with 2D image. Intrinsic::ID intrinsicId = (dim == DimRect) ? table[Dim2D] : table[dim]; - imageInst = CreateIntrinsic(intrinsicId, {intrinsicDataTy, coords[0]->getType()}, args, nullptr, instName); + imageInst = CreateIntrinsic(intrinsicDataTy, intrinsicId, args, nullptr, instName); } else { // Texel buffer descriptor. Use the buffer instruction. imageDescArgIndex = args.size(); @@ -867,7 +867,7 @@ Value *BuilderImpl::CreateImageStore(Value *texel, unsigned dim, unsigned flags, // Rectangle image uses the same Intrinsic ID with 2D image. Intrinsic::ID intrinsicId = (dim == DimRect) ? table[Dim2D] : table[dim]; - imageStore = CreateIntrinsic(intrinsicId, {texelTy, coords[0]->getType()}, args, nullptr, instName); + imageStore = CreateIntrinsic(getVoidTy(), intrinsicId, args, nullptr, instName); } else { // Texel buffer descriptor. Use the buffer instruction. // First widen texel to vec4 if necessary. @@ -1148,9 +1148,6 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign // Build the intrinsic arguments and overloaded types. SmallVector args; - SmallVector overloadTys; - if (resultTy && !resultTy->isVoidTy()) - overloadTys.push_back(resultTy); // Dmask. 
unsigned dmask = 15; @@ -1185,10 +1182,8 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign } // Bias: float - if (Value *biasVal = address[ImageAddressIdxLodBias]) { + if (Value *biasVal = address[ImageAddressIdxLodBias]) args.push_back(biasVal); - overloadTys.push_back(biasVal->getType()); - } // ZCompare (dref) if (Value *zCompareVal = address[ImageAddressIdxZCompare]) { @@ -1198,14 +1193,10 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign } // Grad (explicit derivatives) - if (!derivatives.empty()) { - args.insert(args.end(), derivatives.begin(), derivatives.end()); - overloadTys.push_back(derivatives[0]->getType()); - } + args.insert(args.end(), derivatives.begin(), derivatives.end()); // Coordinate args.insert(args.end(), coords.begin(), coords.end()); - overloadTys.push_back(coords[0]->getType()); // LodClamp if (Value *lodClampVal = address[ImageAddressIdxLodClamp]) @@ -1252,7 +1243,7 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign Intrinsic::ID intrinsicId = (dim == DimRect) ? table->ids[Dim2D] : table->ids[dim]; // Create the intrinsic. - Instruction *imageOp = CreateIntrinsic(intrinsicId, overloadTys, args, nullptr, instName); + Instruction *imageOp = CreateIntrinsic(resultTy, intrinsicId, args, nullptr, instName); // Add a waterfall loop if needed. SmallVector nonUniformArgIndexes; @@ -1368,8 +1359,12 @@ Value *BuilderImpl::CreateImageAtomicCommon(unsigned atomicOp, unsigned dim, uns // Rectangle image uses the same Intrinsic ID with 2D image. Intrinsic::ID intrinsicId = (dim == DimRect) ? 
ImageAtomicIntrinsicTable[atomicOp][Dim2D] : ImageAtomicIntrinsicTable[atomicOp][dim]; +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION >= 511095 + atomicInst = CreateIntrinsic(inputValue->getType(), intrinsicId, args, nullptr, instName); +#else atomicInst = CreateIntrinsic(intrinsicId, {inputValue->getType(), coord->getType()->getScalarType()}, args, nullptr, instName); +#endif } else { // Texel buffer descriptor. Use the buffer atomic instruction. args.push_back(inputValue); @@ -1660,8 +1655,8 @@ Value *BuilderImpl::CreateImageGetLod(unsigned dim, unsigned flags, Value *image args.push_back(getInt32(0)); // tfe/lwe args.push_back(getInt32(0)); // glc/slc - Instruction *result = CreateIntrinsic(ImageGetLodIntrinsicTable[dim], - {FixedVectorType::get(getFloatTy(), 2), getFloatTy()}, args, nullptr, instName); + Instruction *result = + CreateIntrinsic(FixedVectorType::get(getFloatTy(), 2), ImageGetLodIntrinsicTable[dim], args, nullptr, instName); SmallVector nonUniformArgIndexes; if (imageDesc->getType()->isVectorTy()) { @@ -2184,3 +2179,24 @@ Value *BuilderImpl::transformSamplerDesc(Value *samplerDesc) { cast(desc)->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(getContext(), {})); return desc; } + +// ===================================================================================================================== +// Merges a resource descriptor into a feedback descriptor to create a descriptor for sampler feedback instructions. 
+// +// @param feedbackDesc : feedback descriptor +// @param resourceDesc : resource descriptor +// @param instName : Name to give instruction(s) +// @returns Descriptor for use with sampler feedback image sample calls +Value *BuilderImpl::CreateSamplerFeedbackDesc(Value *feedbackDesc, Value *resourceDesc, const Twine &instName) { + GfxIpVersion gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion(); + SqImgRsrcRegHandler feedbackRsrc(this, feedbackDesc, &gfxIp); + SqImgRsrcRegHandler resourceRsrc(this, feedbackDesc, &gfxIp); + + feedbackRsrc.setReg(SqRsrcRegs::BaseLevel, resourceRsrc.getReg(SqRsrcRegs::BaseLevel)); + feedbackRsrc.setReg(SqRsrcRegs::LastLevel, resourceRsrc.getReg(SqRsrcRegs::LastLevel)); + feedbackRsrc.setReg(SqRsrcRegs::Depth, resourceRsrc.getReg(SqRsrcRegs::Depth)); + feedbackRsrc.setReg(SqRsrcRegs::BaseArray, resourceRsrc.getReg(SqRsrcRegs::BaseArray)); + feedbackRsrc.setReg(SqRsrcRegs::MinLod, resourceRsrc.getReg(SqRsrcRegs::MinLod)); + + return feedbackRsrc.getRegister(); +} diff --git a/lgc/builder/InOutBuilder.cpp b/lgc/builder/InOutBuilder.cpp index 9502c6af2e..44439f30a6 100644 --- a/lgc/builder/InOutBuilder.cpp +++ b/lgc/builder/InOutBuilder.cpp @@ -2033,33 +2033,35 @@ void BuilderImpl::markBuiltInOutputUsage(BuiltInKind builtIn, unsigned arraySize } case ShaderStage::Geometry: { - switch (builtIn) { - case BuiltInPointSize: - usage.gs.pointSize = true; - break; - case BuiltInPosition: - usage.gs.position = true; - break; - case BuiltInClipDistance: - usage.gs.clipDistance = std::max(usage.gs.clipDistance, arraySize); - break; - case BuiltInCullDistance: - usage.gs.cullDistance = std::max(usage.gs.cullDistance, arraySize); - break; - case BuiltInPrimitiveId: - usage.gs.primitiveId = true; - break; - case BuiltInViewportIndex: - usage.gs.viewportIndex = true; - break; - case BuiltInLayer: - usage.gs.layer = true; - break; - case BuiltInPrimitiveShadingRate: - usage.gs.primitiveShadingRate = true; - break; - default: - break; + if 
(streamId == m_pipelineState->getRasterizerState().rasterStream) { + switch (builtIn) { + case BuiltInPointSize: + usage.gs.pointSize = true; + break; + case BuiltInPosition: + usage.gs.position = true; + break; + case BuiltInClipDistance: + usage.gs.clipDistance = std::max(usage.gs.clipDistance, arraySize); + break; + case BuiltInCullDistance: + usage.gs.cullDistance = std::max(usage.gs.cullDistance, arraySize); + break; + case BuiltInPrimitiveId: + usage.gs.primitiveId = true; + break; + case BuiltInViewportIndex: + usage.gs.viewportIndex = true; + break; + case BuiltInLayer: + usage.gs.layer = true; + break; + case BuiltInPrimitiveShadingRate: + usage.gs.primitiveShadingRate = true; + break; + default: + break; + } } break; } diff --git a/lgc/builder/MatrixBuilder.cpp b/lgc/builder/MatrixBuilder.cpp index e3ed620d3e..1ad7306101 100644 --- a/lgc/builder/MatrixBuilder.cpp +++ b/lgc/builder/MatrixBuilder.cpp @@ -363,6 +363,8 @@ Type *BuilderCommon::transCooperativeMatrixElementType(CooperativeMatrixElementT case CooperativeMatrixElementType::Int32: return getInt32Ty(); case CooperativeMatrixElementType::Int8: + case CooperativeMatrixElementType::Float8: + case CooperativeMatrixElementType::BFloat8: return getInt8Ty(); default: llvm_unreachable("The element type is not supported."); @@ -411,6 +413,8 @@ bool BuilderCommon::isTypeNCooperativeMatrix(CooperativeMatrixElementType elemTy width = 32; break; case lgc::CooperativeMatrixElementType::Int8: + case lgc::CooperativeMatrixElementType::Float8: + case lgc::CooperativeMatrixElementType::BFloat8: width = 8; break; default: diff --git a/lgc/builder/SubgroupBuilder.cpp b/lgc/builder/SubgroupBuilder.cpp index 65490d9861..598097433e 100644 --- a/lgc/builder/SubgroupBuilder.cpp +++ b/lgc/builder/SubgroupBuilder.cpp @@ -1,4 +1,4 @@ -/* +/* *********************************************************************************************************************** * * Copyright (c) 2019-2024 Advanced Micro Devices, Inc. 
All Rights Reserved. @@ -442,14 +442,7 @@ Value *BuilderImpl::createSubgroupShuffle(Value *const value, Value *const index return result; } - auto mapFunc = [this](BuilderBase &builder, ArrayRef mappedArgs, - ArrayRef passthroughArgs) -> Value * { - Value *const readlane = - builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_readlane, {mappedArgs[0], passthroughArgs[0]}); - return createWaterfallLoop(cast(readlane), 1); - }; - - return CreateMapToSimpleType(mapFunc, value, index); + return createShuffleLoop(value, index, shaderStage); } // ===================================================================================================================== @@ -1418,6 +1411,26 @@ Value *BuilderImpl::createInverseBallotSelect(uint64_t selectMask, Value *const return CreateSelect(inverseBallot, value1, value2); } +// ===================================================================================================================== +// Do group ballot with all active threads participated, turning a boolean value (in a VGPR) into a subgroup-wide +// shared SGPR. +// +// @param value : The value to contribute to the SGPR, must be an boolean type. +Value *BuilderImpl::createGroupBallotAllActive(Value *const value) { + // Check the type is definitely an boolean. + assert(value->getType()->isIntegerTy(1)); + + Value *result = value; + unsigned waveSize = getShaderWaveSize(); + result = CreateIntrinsic(getIntNTy(waveSize), Intrinsic::amdgcn_ballot, result); + + // If we have a 32-bit subgroup size, we need to turn the 32-bit ballot result into a 64-bit result. + if (waveSize <= 32) + result = CreateZExt(result, getInt64Ty()); + + return result; +} + // ===================================================================================================================== // Do group ballot, turning a per-lane boolean value (in a VGPR) into a subgroup-wide shared SGPR. 
// @@ -1435,15 +1448,7 @@ Value *BuilderImpl::createGroupBallot(Value *const value, ShaderStageEnum shader auto isLive = CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {}); result = CreateAnd(isLive, result); } - - unsigned waveSize = getShaderWaveSize(); - result = CreateIntrinsic(getIntNTy(waveSize), Intrinsic::amdgcn_ballot, result); - - // If we have a 32-bit subgroup size, we need to turn the 32-bit ballot result into a 64-bit result. - if (waveSize <= 32) - result = CreateZExt(result, getInt64Ty()); - - return result; + return createGroupBallotAllActive(result); } // ===================================================================================================================== @@ -1454,6 +1459,87 @@ Value *BuilderImpl::createGroupBallot(Value *const value) { return createGroupBallot(value, m_shaderStage.value()); } +// ===================================================================================================================== +// Create a traditional loop for subgroup shuffle. +// +// This is done in three steps: +// 1. Collect the active lane mask for loop condition. +// +// 2. Check whether the shuffle index of each lane is equal to the shuffle index of first lane. If so, update the value +// of the current lane. +// +// 3. Update the first lane by update work list. +// +// Pseudo code: +// result = poison +// workList = ballot(true) +// do { +// firstLaneIdx = find_first_set(workList) +// currentSrcLaneIdx = readlane(srcLaneIdx, firstLaneIdx) +// notCurrentLane = srcLaneIdx != currentSrcLaneIdx +// CreateMapToSimpleType +// value = readlane(srcData, currentSrcLaneIdx) +// result = notCurrentLane ? result : value +// workList &= ballot(notCurrentLane) +// } +// while (workList != 0) +// +// @param value : The value to shuffle. +// @param index : The index to shuffle from. 
+// @param instName : Name to give instruction(s) +llvm::Value *BuilderImpl::createShuffleLoop(llvm::Value *const value, llvm::Value *const index, + ShaderStageEnum shaderStage, const llvm::Twine &instName) { + assert(value != nullptr && index != nullptr); + // Return readlane directly, if the index is a constant value. + if (isa(index)) + return CreateIntrinsic(getInt32Ty(), Intrinsic::amdgcn_readlane, {value, index}); + + // Creat workList out of loop + // By implementation, the Insert point has been set to the callInst when call processCall + auto *loopPoint = &*(GetInsertPoint()); + auto *originalBlock = loopPoint->getParent(); + + // We are forcing all active threads participate the shuffle because CreateSubgroupClusteredMultiExclusive() + // depends on this to be correct. + // TODO: Refine the code or algorithm so that createShuffleLoop is no longer affected by external code + // implementations. + auto *workList = createGroupBallotAllActive(getTrue()); + + // Init loop block. + auto *loop = originalBlock->splitBasicBlock(loopPoint, ".shuffleLoop"); + auto *loopNext = loop->splitBasicBlock(loop->getFirstInsertionPt()); + SetInsertPoint(loop->getFirstInsertionPt()); + + Type *waveSize = workList->getType(); + auto *resultPhi = CreatePHI(value->getType(), 2); + auto *workListPhi = CreatePHI(workList->getType(), 2); + resultPhi->addIncoming(PoisonValue::get(value->getType()), originalBlock); + workListPhi->addIncoming(workList, originalBlock); + auto *firstLaneIndex = + CreateZExtOrTrunc(CreateIntrinsic(Intrinsic::cttz, waveSize, {workListPhi, getTrue()}), getInt32Ty()); + // In each loop iteration, the lanes with the same shuffle source index are being processed together. So, + // the iteration count will be equal to the count of unique values of the shuffle index. 
+ Value *const currentSrcLaneIndex = + CreateIntrinsic(index->getType(), Intrinsic::amdgcn_readlane, {index, firstLaneIndex}); + auto *notCurrentLane = CreateICmpNE(index, currentSrcLaneIndex); + auto mapFunc = [](BuilderBase &builder, ArrayRef mappedArgs, ArrayRef passthroughArgs) -> Value * { + Value *const index = passthroughArgs[0]; + Value *const result = mappedArgs[0]; + Value *const srcDate = mappedArgs[1]; + Value *const value = builder.CreateIntrinsic(srcDate->getType(), Intrinsic::amdgcn_readlane, {srcDate, index}); + return builder.CreateSelect(passthroughArgs[1], result, value); + }; + auto result = CreateMapToSimpleType(mapFunc, {resultPhi, value}, {currentSrcLaneIndex, notCurrentLane}); + auto newWorkList = CreateAnd(createGroupBallotAllActive(notCurrentLane), workListPhi); + resultPhi->addIncoming(result, loop); + workListPhi->addIncoming(newWorkList, loop); + auto *cond = CreateICmpEQ(newWorkList, ConstantInt::get(waveSize, 0)); + CreateCondBr(cond, loopNext, loop); + loop->back().eraseFromParent(); + SetInsertPoint(loopPoint); + return result; +} + // ===================================================================================================================== // Search the MSB index of the mask, not handle zero. 
// diff --git a/lgc/disassembler/Disassembler.cpp b/lgc/disassembler/Disassembler.cpp index 8dc125c0a3..0466a4b548 100644 --- a/lgc/disassembler/Disassembler.cpp +++ b/lgc/disassembler/Disassembler.cpp @@ -257,7 +257,9 @@ void ObjDisassembler::run() { std::unique_ptr regInfo(m_target->createMCRegInfo(m_tripleName)); if (!regInfo) report_fatal_error(m_data.getBufferIdentifier() + ": No register info for target"); - std::unique_ptr asmInfo(m_target->createMCAsmInfo(*regInfo, m_tripleName, MCTargetOptions())); + MCTargetOptions targetOptions{}; + targetOptions.AsmVerbose = true; + std::unique_ptr asmInfo(m_target->createMCAsmInfo(*regInfo, m_tripleName, targetOptions)); if (!asmInfo) report_fatal_error(m_data.getBufferIdentifier() + ": No assembly info for target"); m_subtargetInfo.reset(m_target->createMCSubtargetInfo(m_tripleName, *mcpu, features.getString())); @@ -267,7 +269,7 @@ void ObjDisassembler::run() { if (!instrInfo) report_fatal_error(m_data.getBufferIdentifier() + ": No instruction info for target"); - MCContext context(triple, asmInfo.get(), regInfo.get(), m_subtargetInfo.get()); + MCContext context(triple, asmInfo.get(), regInfo.get(), m_subtargetInfo.get(), nullptr, &targetOptions); std::unique_ptr objFileInfo(m_target->createMCObjectFileInfo(context, /*PIC=*/false)); if (!objFileInfo) report_fatal_error("No MC object file info"); @@ -282,9 +284,12 @@ void ObjDisassembler::run() { report_fatal_error(m_data.getBufferIdentifier() + ": No instruction printer for target"); auto fostream = std::make_unique(m_ostream); +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 505779 m_streamer.reset(m_target->createAsmStreamer(*m_context, std::move(fostream), true, false, m_instPrinter, nullptr, nullptr, false)); - +#else + m_streamer.reset(m_target->createAsmStreamer(*m_context, std::move(fostream), m_instPrinter, nullptr, nullptr)); +#endif // Process each section. 
for (ELFSectionRef sectionRef : m_objFile->sections()) processSection(sectionRef); diff --git a/lgc/elfLinker/ElfLinker.cpp b/lgc/elfLinker/ElfLinker.cpp index 7fdda5b912..967675eda5 100644 --- a/lgc/elfLinker/ElfLinker.cpp +++ b/lgc/elfLinker/ElfLinker.cpp @@ -72,11 +72,6 @@ ElfLinkerImpl::ElfLinkerImpl(PipelineState *pipelineState, ArrayRef elfs); - // Destructor - ~ElfLinkerImpl() override final; - // ----------------------------------------------------------------------------------------------------------------- // Implementations of ElfLinker methods exposed to the front-end diff --git a/lgc/elfLinker/GlueShader.h b/lgc/elfLinker/GlueShader.h index 4bb2a42c15..6d4b8c102e 100644 --- a/lgc/elfLinker/GlueShader.h +++ b/lgc/elfLinker/GlueShader.h @@ -43,7 +43,7 @@ class LgcContext; // Base class for a glue shader (a fetch shader or parameter/color export shader generated during linking) class GlueShader { public: - virtual ~GlueShader() {} + virtual ~GlueShader() = default; // Create a color export shader static std::unique_ptr createColorExportShader(PipelineState *pipelineState, diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h index b1fd73c612..7603c29cf8 100644 --- a/lgc/include/lgc/builder/BuilderImpl.h +++ b/lgc/include/lgc/builder/BuilderImpl.h @@ -312,6 +312,10 @@ class BuilderImpl : public BuilderDefs { // Build buffer compact descriptor llvm::Value *buildBufferCompactDesc(llvm::Value *desc, unsigned stride); + // Build image sampler feedback descriptor + llvm::Value *CreateSamplerFeedbackDesc(llvm::Value *feedbackDesc, llvm::Value *resourceDesc, + const llvm::Twine &instName = ""); + private: // Get a struct containing the pointer and byte stride for a descriptor llvm::Value *getDescPtrAndStride(ResourceNodeType resType, uint64_t descSet, unsigned binding, @@ -791,10 +795,14 @@ class BuilderImpl : public BuilderDefs { uint16_t getDsSwizzleQuadMode(uint8_t lane0, uint8_t lane1, uint8_t lane2, uint8_t lane3); 
llvm::Value *createGroupBallot(llvm::Value *const value); + // Create a traditional loop for subgroup shuffle. + llvm::Value *createShuffleLoop(llvm::Value *const value, llvm::Value *const index, ShaderStageEnum shaderStage, + const llvm::Twine &instName = ""); protected: // The subgroup operation with explicit shader stage as parameter. llvm::Value *createFindMsb(llvm::Value *const mask); + llvm::Value *createGroupBallotAllActive(llvm::Value *const value); llvm::Value *createGroupBallot(llvm::Value *const value, ShaderStageEnum shaderStage); llvm::Value *createSubgroupBroadcastFirst(llvm::Value *const value, ShaderStageEnum shaderStage, const llvm::Twine &instName); diff --git a/lgc/include/lgc/patch/PatchLoopMetadata.h b/lgc/include/lgc/patch/AddLoopMetadata.h similarity index 99% rename from lgc/include/lgc/patch/PatchLoopMetadata.h rename to lgc/include/lgc/patch/AddLoopMetadata.h index b547dc7910..afe46f16ca 100644 --- a/lgc/include/lgc/patch/PatchLoopMetadata.h +++ b/lgc/include/lgc/patch/AddLoopMetadata.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLoopMetadata.h + * @file AddLoopMetadata.h * @brief LLPC header file: contains declaration of class lgc::PatchLoopMetadata. 
*********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/PatchCheckShaderCache.h b/lgc/include/lgc/patch/CheckShaderCache.h similarity index 89% rename from lgc/include/lgc/patch/PatchCheckShaderCache.h rename to lgc/include/lgc/patch/CheckShaderCache.h index eb454f6461..5d31f625a6 100644 --- a/lgc/include/lgc/patch/PatchCheckShaderCache.h +++ b/lgc/include/lgc/patch/CheckShaderCache.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchCheckShaderCache.h - * @brief LLPC header file: contains declaration of class lgc::PatchCheckShaderCache + * @file CheckShaderCache.h + * @brief LLPC header file: contains declaration of class lgc::CheckShaderCache *********************************************************************************************************************** */ #pragma once @@ -38,11 +38,11 @@ namespace lgc { // ===================================================================================================================== // Represents the pass of LLVM patching operations for checking shader cache -class PatchCheckShaderCache : public Patch, public llvm::PassInfoMixin { +class CheckShaderCache : public Patch, public llvm::PassInfoMixin { public: - PatchCheckShaderCache() {} + CheckShaderCache() {} - PatchCheckShaderCache(Pipeline::CheckShaderCacheFunc callbackFunc); + CheckShaderCache(Pipeline::CheckShaderCacheFunc callbackFunc); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/PatchImageOpCollect.h b/lgc/include/lgc/patch/CollectImageOperations.h similarity index 98% rename from lgc/include/lgc/patch/PatchImageOpCollect.h rename to 
lgc/include/lgc/patch/CollectImageOperations.h index b124ae15da..d4f563f37b 100644 --- a/lgc/include/lgc/patch/PatchImageOpCollect.h +++ b/lgc/include/lgc/patch/CollectImageOperations.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchImageOpCollect.h + * @file CollectImageOperations.h * @brief LLPC header file: contains declaration of class lgc::PatchImageOpCollect. *********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/FragColorExport.h b/lgc/include/lgc/patch/FragColorExport.h index c7e949f547..1fcd756226 100644 --- a/lgc/include/lgc/patch/FragColorExport.h +++ b/lgc/include/lgc/patch/FragColorExport.h @@ -66,6 +66,7 @@ class FragColorExport { unsigned expFmt[MaxColorTargets]; // Export format used for "export" instruction. unsigned waveSize; // The wave size for fragment. bool enableFragColor; // Whether to broadcast frag color. Only for OGLP + ExportFormat dummyExpFmt; // Export format used for dummy "export" instruction. 
}; FragColorExport(LgcContext *context); @@ -74,7 +75,6 @@ class FragColorExport { bool dummyExport, PalMetadata *palMetadata, BuilderBase &builder, llvm::Value *dynamicIsDualSource, const Key &key); static void setDoneFlag(llvm::Value *exportInst, BuilderBase &builder); - static llvm::CallInst *addDummyExport(BuilderBase &builder); static llvm::Function *generateNullFragmentShader(llvm::Module &module, PipelineState *pipelineState, llvm::StringRef entryPointName); static llvm::Function *generateNullFragmentEntryPoint(llvm::Module &module, PipelineState *pipelineState, diff --git a/lgc/include/lgc/patch/PatchCopyShader.h b/lgc/include/lgc/patch/GenerateCopyShader.h similarity index 94% rename from lgc/include/lgc/patch/PatchCopyShader.h rename to lgc/include/lgc/patch/GenerateCopyShader.h index f52784a094..a01372dd45 100644 --- a/lgc/include/lgc/patch/PatchCopyShader.h +++ b/lgc/include/lgc/patch/GenerateCopyShader.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchCopyShader.h - * @brief LLPC header file: contains declaration of class lgc::PatchCopyShader. + * @file GenerateCopyShader.h + * @brief LLPC header file: contains declaration of class lgc::GenerateCopyShader. 
*********************************************************************************************************************** */ #pragma once @@ -40,7 +40,7 @@ namespace lgc { // ===================================================================================================================== // Pass to generate copy shader if required -class PatchCopyShader : public Patch, public llvm::PassInfoMixin { +class GenerateCopyShader : public Patch, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/PatchLlvmIrInclusion.h b/lgc/include/lgc/patch/IncludeLlvmIr.h similarity index 98% rename from lgc/include/lgc/patch/PatchLlvmIrInclusion.h rename to lgc/include/lgc/patch/IncludeLlvmIr.h index 1a253cabb9..b198d2c60a 100644 --- a/lgc/include/lgc/patch/PatchLlvmIrInclusion.h +++ b/lgc/include/lgc/patch/IncludeLlvmIr.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLlvmIrInclusion.h + * @file IncludeLlvmIr.h * @brief LLPC header file: contains declaration of class lgc::PatchLlvmIrInclusion. 
*********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/LowerGpuRt.h b/lgc/include/lgc/patch/LowerGpuRt.h index 5876d01cd7..c11e6a5cb8 100644 --- a/lgc/include/lgc/patch/LowerGpuRt.h +++ b/lgc/include/lgc/patch/LowerGpuRt.h @@ -66,13 +66,14 @@ class GpurtInitStaticIdOp; class LowerGpuRt : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + void updateWorkgroupSize(llvm::Function *func); private: typedef void (LowerGpuRt::*LibraryFuncPtr)(llvm::Function *, unsigned); const static unsigned MaxLdsStackEntries = 16; - uint32_t getWorkgroupSize() const; + unsigned getWorkgroupSize(llvm::Function *func) const; llvm::Value *getThreadIdInGroup() const; - void createGlobalStack(llvm::Module &module); + void createLdsStack(llvm::Module &module); void createRayStaticIdValue(); void visitGetStackSize(lgc::GpurtGetStackSizeOp &inst); void visitGetStackBase(lgc::GpurtGetStackBaseOp &inst); @@ -100,5 +101,6 @@ class LowerGpuRt : public llvm::PassInfoMixin { llvm::SmallSet m_funcsToLower; // Functions to lower Builder *m_builder = nullptr; unsigned m_rayStaticId = 0; + unsigned m_workGroupSize = 0; }; } // namespace lgc diff --git a/lgc/include/lgc/patch/PatchImageDerivatives.h b/lgc/include/lgc/patch/LowerImageDerivatives.h similarity index 92% rename from lgc/include/lgc/patch/PatchImageDerivatives.h rename to lgc/include/lgc/patch/LowerImageDerivatives.h index 7af727cfaf..2f1c024fa7 100644 --- a/lgc/include/lgc/patch/PatchImageDerivatives.h +++ b/lgc/include/lgc/patch/LowerImageDerivatives.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchImageDerivatives.h - * @brief 
LLPC header file: contains declaration of class lgc::PatchImageDerivatives. + * @file LowerImageDerivatives.h + * @brief LLPC header file: contains declaration of class lgc::LowerImageDerivatives. *********************************************************************************************************************** */ #pragma once @@ -38,7 +38,7 @@ namespace lgc { // ===================================================================================================================== // Represents the pass of LLVM patching operations for image operations -class PatchImageDerivatives : public llvm::PassInfoMixin { +class LowerImageDerivatives : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/PatchInOutImportExport.h b/lgc/include/lgc/patch/LowerInOut.h similarity index 99% rename from lgc/include/lgc/patch/PatchInOutImportExport.h rename to lgc/include/lgc/patch/LowerInOut.h index 22f4c1672f..e165006e49 100644 --- a/lgc/include/lgc/patch/PatchInOutImportExport.h +++ b/lgc/include/lgc/patch/LowerInOut.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchInOutImportExport.h + * @file LowerInOut.h * @brief LLPC header file: contains declaration of class lgc::PatchInOutImportExport. 
*********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/PatchInvariantLoads.h b/lgc/include/lgc/patch/LowerInvariantLoads.h similarity index 89% rename from lgc/include/lgc/patch/PatchInvariantLoads.h rename to lgc/include/lgc/patch/LowerInvariantLoads.h index b241621c97..f3180ca0cc 100644 --- a/lgc/include/lgc/patch/PatchInvariantLoads.h +++ b/lgc/include/lgc/patch/LowerInvariantLoads.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchInvariantLoads.h - * @brief LLPC header file: contains declaration of class lgc::PatchInvariantLoads. + * @file LowerInvariantLoads.h + * @brief LLPC header file: contains declaration of class lgc::LowerInvariantLoads. *********************************************************************************************************************** */ #pragma once @@ -37,8 +37,8 @@ namespace lgc { // ===================================================================================================================== -// Represents the pass of LLVM patching operations for image operations -class PatchInvariantLoads : public llvm::PassInfoMixin { +// Represents the LLVM pass for patching operations to lower invariant loads +class LowerInvariantLoads : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Function &function, llvm::FunctionAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/PatchMulDx9Zero.h b/lgc/include/lgc/patch/LowerMulDx9Zero.h similarity index 99% rename from lgc/include/lgc/patch/PatchMulDx9Zero.h rename to lgc/include/lgc/patch/LowerMulDx9Zero.h index 47861db8df..400c447187 100644 --- a/lgc/include/lgc/patch/PatchMulDx9Zero.h +++ 
b/lgc/include/lgc/patch/LowerMulDx9Zero.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchMulDx9Zero.h + * @file LowerMulDx9Zero.h * @brief LLPC header file: contains declaration of class lgc::PatchMulDx9Zero. *********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/PatchEntryPointMutate.h b/lgc/include/lgc/patch/MutateEntryPoint.h similarity index 96% rename from lgc/include/lgc/patch/PatchEntryPointMutate.h rename to lgc/include/lgc/patch/MutateEntryPoint.h index e56338b3ef..639f3fca53 100644 --- a/lgc/include/lgc/patch/PatchEntryPointMutate.h +++ b/lgc/include/lgc/patch/MutateEntryPoint.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchEntryPointMutate.h - * @brief LLPC header file: contains declaration of class lgc::PatchEntryPointMutate. + * @file MutateEntryPoint.h + * @brief LLPC header file: contains declaration of class lgc::MutateEntryPoint. 
*********************************************************************************************************************** */ #pragma once @@ -48,9 +48,9 @@ class UserDataOp; // ===================================================================================================================== // The entry-point mutation pass -class PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin { +class MutateEntryPoint : public Patch, public llvm::PassInfoMixin { public: - PatchEntryPointMutate(); + MutateEntryPoint(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Patch LLVM for entry-point mutation"; } @@ -86,7 +86,7 @@ class PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin, ShaderStage::Count> m_userDataUsage; + ShaderStageMap> m_userDataUsage; class CpsShaderInputCache { public: diff --git a/lgc/include/lgc/patch/Patch.h b/lgc/include/lgc/patch/Patch.h index f78951b524..c938e26da2 100644 --- a/lgc/include/lgc/patch/Patch.h +++ b/lgc/include/lgc/patch/Patch.h @@ -49,8 +49,8 @@ class PassManager; // Represents the pass of LLVM patching operations, as the base class. 
class Patch { public: - Patch() : m_module(nullptr), m_context(nullptr), m_shaderStage(ShaderStage::Invalid), m_entryPoint(nullptr) {} - virtual ~Patch() {} + Patch() : m_module(nullptr), m_context(nullptr), m_entryPoint(nullptr) {} + virtual ~Patch() = default; static void addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, llvm::Timer *patchTimer, llvm::Timer *optTimer, Pipeline::CheckShaderCacheFunc checkShaderCacheFunc, uint32_t optLevel); @@ -68,10 +68,10 @@ class Patch { void init(llvm::Module *module); - llvm::Module *m_module; // LLVM module to be run on - llvm::LLVMContext *m_context; // Associated LLVM context of the LLVM module that passes run on - ShaderStageEnum m_shaderStage; // Shader stage - llvm::Function *m_entryPoint; // Entry-point + llvm::Module *m_module; // LLVM module to be run on + llvm::LLVMContext *m_context; // Associated LLVM context of the LLVM module that passes run on + std::optional m_shaderStage; // Shader stage + llvm::Function *m_entryPoint; // Entry-point }; } // namespace lgc diff --git a/lgc/include/lgc/patch/PatchLoadScalarizer.h b/lgc/include/lgc/patch/ScalarizeLoads.h similarity index 98% rename from lgc/include/lgc/patch/PatchLoadScalarizer.h rename to lgc/include/lgc/patch/ScalarizeLoads.h index f29ed9e4c4..670667e58f 100644 --- a/lgc/include/lgc/patch/PatchLoadScalarizer.h +++ b/lgc/include/lgc/patch/ScalarizeLoads.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLoadScalarizer.h + * @file ScalarizeLoads.h * @brief LLPC header file: contains declaration of class lgc::PatchLoadScalarizer. 
*********************************************************************************************************************** */ diff --git a/lgc/include/lgc/patch/ShaderInputs.h b/lgc/include/lgc/patch/ShaderInputs.h index 390cc43699..d7b139a53f 100644 --- a/lgc/include/lgc/patch/ShaderInputs.h +++ b/lgc/include/lgc/patch/ShaderInputs.h @@ -145,12 +145,12 @@ enum class ShaderInput : unsigned { // ===================================================================================================================== // Class for handling shader inputs (other than user data) // -// From BuilderImpl up to just before PatchEntryPointMutate, static methods in this class can be used to +// From BuilderImpl up to just before MutateEntryPoint, static methods in this class can be used to // generate code to access shader inputs. That generates an lgc.shader.input.* call for each access. // -// The PatchEntryPointMutate pass creates a ShaderInputs object, and uses a method on it to gather already- +// The MutateEntryPoint pass creates a ShaderInputs object, and uses a method on it to gather already- // generated uses of shader inputs, and another method to create arguments for the shader function based -// on that, and on usage that will happen after PatchEntryPointMutate. +// on that, and on usage that will happen after MutateEntryPoint. // // The resulting shader function has input arguments that represent a kind of idealized GFX8 shader, // before GFX9+ shader merging and/or GFX10+ NGG primitive shader formation. 
@@ -173,7 +173,7 @@ class ShaderInputs { static const char *getInputName(ShaderInput inputKind); // ------------------------------------------------------------------------------------------------------------------- - // Static methods called before PatchEntryPointMutate + // Static methods called before MutateEntryPoint // Get a special user data value by inserting a call to lgc.special.user.data static llvm::CallInst *getSpecialUserData(UserDataMapping kind, BuilderBase &builder); @@ -191,9 +191,9 @@ class ShaderInputs { static llvm::Value *getInput(ShaderInput kind, BuilderBase &builder, const LgcContext &lgcContext); // ------------------------------------------------------------------------------------------------------------------- - // Object methods called during PatchEntryPointMutate + // Object methods called during MutateEntryPoint - // Gather usage of shader inputs from before PatchEntryPointMutate + // Gather usage of shader inputs from before MutateEntryPoint void gatherUsage(llvm::Module &module); // Fix up uses of shader inputs to use entry args directly @@ -230,7 +230,7 @@ class ShaderInputs { // amdgpu-no-workgroup-id-* void tryOptimizeWorkgroupId(PipelineState *pipelineState, ShaderStageEnum shaderStage, llvm::Function *origFunc); - llvm::SmallVector m_shaderInputsUsage; + ShaderStageMap m_shaderInputsUsage; }; } // namespace lgc diff --git a/lgc/include/lgc/patch/VertexFetch.h b/lgc/include/lgc/patch/VertexFetch.h index 8f56728457..83d7b47314 100644 --- a/lgc/include/lgc/patch/VertexFetch.h +++ b/lgc/include/lgc/patch/VertexFetch.h @@ -43,7 +43,7 @@ class InputImportGenericOp; // Public interface to vertex fetch manager. 
class VertexFetch { public: - virtual ~VertexFetch() {} + virtual ~VertexFetch() = default; // Create a VertexFetch static VertexFetch *create(LgcContext *lgcContext, bool useSoftwareVertexBufferDescriptors, diff --git a/lgc/include/lgc/state/PipelineShaders.h b/lgc/include/lgc/state/PipelineShaders.h index 6c5fb59af7..7a9dfe0783 100644 --- a/lgc/include/lgc/state/PipelineShaders.h +++ b/lgc/include/lgc/state/PipelineShaders.h @@ -46,7 +46,7 @@ class PipelineShadersResult { std::optional getShaderStage(const llvm::Function *func) const; private: - llvm::Function *m_entryPoints[ShaderStage::CountInternal]; // The entry-point for each shader stage. + ShaderStageMap m_entryPoints; // The entry-point for each shader stage. std::map m_entryPointMap; // Map from shader entry-point to shader stage. }; diff --git a/lgc/include/lgc/state/PipelineState.h b/lgc/include/lgc/state/PipelineState.h index 0515968f17..b4375d5f85 100644 --- a/lgc/include/lgc/state/PipelineState.h +++ b/lgc/include/lgc/state/PipelineState.h @@ -254,6 +254,7 @@ class PipelineState final : public Pipeline { // Accessors for shader stage mask ShaderStageMask getShaderStageMask(); bool getPreRasterHasGs() const { return m_preRasterHasGs; } + bool hasShaderStage(ShaderStageEnum stage) { return getShaderStageMask().contains(stage); } bool isGraphics(); bool isComputeLibrary() const { return m_computeLibrary; } @@ -274,7 +275,7 @@ class PipelineState final : public Pipeline { llvm::ArrayRef getUserDataNodes() const { return m_userDataNodes; } // Find the push constant resource node - const ResourceNode *findPushConstantResourceNode(std::optional shaderStage = std::nullopt) const; + const ResourceNode *findPushConstantResourceNode(std::optional shaderStage) const; // Find the resource node for the given set,binding std::pair @@ -282,7 +283,8 @@ class PipelineState final : public Pipeline { std::optional shaderStage = std::nullopt) const; // Find the single root resource node of the given type - const 
ResourceNode *findSingleRootResourceNode(ResourceNodeType nodeType, ShaderStageEnum shaderStage) const; + const ResourceNode *findSingleRootResourceNode(ResourceNodeType nodeType, + std::optional shaderStage) const; // Accessors for vertex input descriptions. llvm::ArrayRef getVertexInputDescriptions() const { return m_vertexInputDescriptions; } @@ -302,7 +304,6 @@ class PipelineState final : public Pipeline { // Set GS on-chip mode void setGsOnChip(bool gsOnChip) { m_gsOnChip = gsOnChip; } - // Checks whether GS on-chip mode is enabled // NOTE: on GFX9, ES -> GS ring is always on-chip, GS on-chip mode means GS -> VS // ring is on-chip. @@ -449,7 +450,12 @@ class PipelineState final : public Pipeline { } // Get user data for a specific shader stage - llvm::ArrayRef getUserDataMap(ShaderStageEnum shaderStage) const { return m_userDataMaps[shaderStage]; } + llvm::ArrayRef getUserDataMap(ShaderStageEnum shaderStage) const { + auto it = m_userDataMaps.find(shaderStage); + if (it != m_userDataMaps.end()) + return it->second; + return {}; + } // Set spill_threshold for a specific shader stage void setSpillThreshold(ShaderStageEnum shaderStage, unsigned spillThreshold) { @@ -583,7 +589,7 @@ class PipelineState final : public Pipeline { llvm::ArrayRef getResourceTypeNames(); llvm::MDString *getResourceTypeName(ResourceNodeType type); ResourceNodeType getResourceTypeFromName(llvm::MDString *typeName); - bool matchResourceNode(const ResourceNode &node, ResourceNodeType nodeType, uint64_t descSet, unsigned binding) const; + bool matchResourceNode(const ResourceNode &node, uint64_t descSet, unsigned binding) const; // Device index handling void recordDeviceIndex(llvm::Module *module); @@ -613,7 +619,7 @@ class PipelineState final : public Pipeline { bool m_computeLibrary = false; // Whether pipeline is in fact a compute library std::string m_client; // Client name for PAL metadata Options m_options = {}; // Per-pipeline options - std::vector m_shaderOptions; // Per-shader 
options + ShaderStageMap m_shaderOptions; // Per-shader options std::unique_ptr m_allocUserDataNodes; // Allocated buffer for user data llvm::ArrayRef m_userDataNodes; // Top-level user data node table // Cached MDString for each resource node type @@ -621,27 +627,27 @@ class PipelineState final : public Pipeline { // Allocated buffers for immutable sampler data llvm::SmallVector, 4> m_immutableValueAllocs; - bool m_gsOnChip = false; // Whether to use GS on-chip mode - bool m_meshRowExport = false; // Enable mesh shader row export or not - NggControl m_nggControl = {}; // NGG control settings - ShaderModes m_shaderModes; // Shader modes for this pipeline - unsigned m_deviceIndex = 0; // Device index - std::vector m_vertexInputDescriptions; // Vertex input descriptions - llvm::SmallVector m_colorExportFormats; // Color export formats - ColorExportState m_colorExportState = {}; // Color export state - InputAssemblyState m_inputAssemblyState = {}; // Input-assembly state - RasterizerState m_rasterizerState = {}; // Rasterizer state - DepthStencilState m_depthStencilState = {}; // Depth/stencil state - std::unique_ptr m_resourceUsage[ShaderStage::Compute + 1] = {}; // Per-shader ResourceUsage - std::unique_ptr m_interfaceData[ShaderStage::Compute + 1] = {}; // Per-shader InterfaceData - PalMetadata *m_palMetadata = nullptr; // PAL metadata object - unsigned m_waveSize[ShaderStage::CountInternal] = {}; // Per-shader wave size - unsigned m_subgroupSize[ShaderStage::CountInternal] = {}; // Per-shader subgroup size - bool m_inputPackState[ShaderStage::GfxCount] = {}; // The input packable state per shader stage - bool m_outputPackState[ShaderStage::GfxCount] = {}; // The output packable state per shader stage - XfbStateMetadata m_xfbStateMetadata = {}; // Transform feedback state metadata - llvm::SmallVector m_userDataMaps[ShaderStage::CountInternal]; // The user data per-shader - unsigned m_shaderSpillThreshold[ShaderStage::CountInternal] = {}; // The spillThreshold 
per-shader + bool m_gsOnChip = false; // Whether to use GS on-chip mode + bool m_meshRowExport = false; // Enable mesh shader row export or not + NggControl m_nggControl = {}; // NGG control settings + ShaderModes m_shaderModes; // Shader modes for this pipeline + unsigned m_deviceIndex = 0; // Device index + std::vector m_vertexInputDescriptions; // Vertex input descriptions + llvm::SmallVector m_colorExportFormats; // Color export formats + ColorExportState m_colorExportState = {}; // Color export state + InputAssemblyState m_inputAssemblyState = {}; // Input-assembly state + RasterizerState m_rasterizerState = {}; // Rasterizer state + DepthStencilState m_depthStencilState = {}; // Depth/stencil state + ShaderStageMap> m_resourceUsage; // Per-shader ResourceUsage + ShaderStageMap> m_interfaceData; // Per-shader InterfaceData + PalMetadata *m_palMetadata = nullptr; // PAL metadata object + ShaderStageMap m_waveSize; // Per-shader wave size + ShaderStageMap m_subgroupSize; // Per-shader subgroup size + ShaderStageMap m_inputPackState; // The input packable state per shader stage + ShaderStageMap m_outputPackState; // The output packable state per shader stage + XfbStateMetadata m_xfbStateMetadata = {}; // Transform feedback state metadata + ShaderStageMap> m_userDataMaps; // The user data per-shader + unsigned m_shaderSpillThreshold[ShaderStage::CountInternal] = {}; // The spillThreshold per-shader struct { float inner[2]; // default tessellation inner level diff --git a/lgc/include/lgc/state/ResourceUsage.h b/lgc/include/lgc/state/ResourceUsage.h index 5f617e8a99..e8e946e3af 100644 --- a/lgc/include/lgc/state/ResourceUsage.h +++ b/lgc/include/lgc/state/ResourceUsage.h @@ -639,7 +639,7 @@ struct InterfaceData { }; bool initialized; // Whether entryArgIdxs has been initialized - // by PatchEntryPointMutate + // by MutateEntryPoint } entryArgIdxs = {}; InterfaceData(); diff --git a/lgc/include/lgc/state/ShaderModes.h b/lgc/include/lgc/state/ShaderModes.h index 
d297b35914..ecd1653d18 100644 --- a/lgc/include/lgc/state/ShaderModes.h +++ b/lgc/include/lgc/state/ShaderModes.h @@ -107,12 +107,12 @@ class ShaderModes { void readModesFromPipeline(llvm::Module *module); private: - CommonShaderMode m_commonShaderModes[ShaderStage::Compute + 1] = {}; // Per-shader FP modes - TessellationMode m_tessellationMode = {}; // Tessellation mode - GeometryShaderMode m_geometryShaderMode = {}; // Geometry shader mode - MeshShaderMode m_meshShaderMode = {}; // Mesh shader mode - FragmentShaderMode m_fragmentShaderMode = {}; // Fragment shader mode - ComputeShaderMode m_computeShaderMode = {}; // Compute shader mode (workgroup size) + ShaderStageMap m_commonShaderModes; // Per-shader FP mode + TessellationMode m_tessellationMode = {}; // Tessellation mode + GeometryShaderMode m_geometryShaderMode = {}; // Geometry shader mode + MeshShaderMode m_meshShaderMode = {}; // Mesh shader mode + FragmentShaderMode m_fragmentShaderMode = {}; // Fragment shader mode + ComputeShaderMode m_computeShaderMode = {}; // Compute shader mode (workgroup size) }; } // namespace lgc diff --git a/lgc/include/lgc/util/GfxRegHandler.h b/lgc/include/lgc/util/GfxRegHandler.h index 541a153d2c..03e0241de4 100644 --- a/lgc/include/lgc/util/GfxRegHandler.h +++ b/lgc/include/lgc/util/GfxRegHandler.h @@ -61,7 +61,7 @@ struct BitsState { // ===================================================================================================================== // Helper class for handling graphics registers. // Note: 1) Don't use GfxRegHandler directly, please implement your own register helper class, such as -// SqImgSampRegHelper +// SqImgSampRegHandler // 2) The ID (enum) used in this class is determined by BitsInfo // 3) The count of BisState used in this class is determined by BitsInfo // e.g. 
@@ -154,6 +154,9 @@ enum class SqRsrcRegs { WidthLo, WidthHi, ArrayPitch, + MinLod, + MinLodLo, + MinLodHi, Count, }; diff --git a/lgc/interface/lgc/Builder.h b/lgc/interface/lgc/Builder.h index f48bf0a7ad..fde589f64c 100644 --- a/lgc/interface/lgc/Builder.h +++ b/lgc/interface/lgc/Builder.h @@ -955,6 +955,14 @@ class Builder : public BuilderDefs { // @param instName : Name to give instruction(s) llvm::Value *CreateLoadPushConstantsPtr(const llvm::Twine &instName = ""); + // Merges a resource descriptor into a feedback descriptor to create a descriptor for sampler feedback instructions. + // + // @param feedbackDesc : feedback descriptor + // @param resourceDesc : resource descriptor + // @param instName : Name to give instruction(s) + llvm::Value *CreateSamplerFeedbackDesc(llvm::Value *feedbackDesc, llvm::Value *resourceDesc, + const llvm::Twine &instName = ""); + // ----------------------------------------------------------------------------------------------------------------- // Image operations diff --git a/lgc/interface/lgc/CommonDefs.h b/lgc/interface/lgc/CommonDefs.h index 4ae27eeabb..bc927c0859 100644 --- a/lgc/interface/lgc/CommonDefs.h +++ b/lgc/interface/lgc/CommonDefs.h @@ -235,6 +235,9 @@ enum class ResourceLayoutScheme : unsigned { Compact = 0, ///< Compact scheme make full use of all the user data registers. Indirect ///< Fixed layout, push constant will be the sub node of DescriptorTableVaPtr }; + +template +using ShaderStageMap = llvm::SmallDenseMap; } // namespace lgc namespace llvm { diff --git a/lgc/interface/lgc/ElfLinker.h b/lgc/interface/lgc/ElfLinker.h index e9114a17f2..d1b260c199 100644 --- a/lgc/interface/lgc/ElfLinker.h +++ b/lgc/interface/lgc/ElfLinker.h @@ -49,7 +49,7 @@ struct ColorExportInfo; // relocs. class ElfLinker { public: - virtual ~ElfLinker() {} + virtual ~ElfLinker() = default; // Add another input ELF to the link, in addition to the ones that were added when the ElfLinker was constructed. 
virtual void addInputElf(llvm::MemoryBufferRef inputElf) = 0; diff --git a/lgc/interface/lgc/LgcDialect.h b/lgc/interface/lgc/LgcDialect.h index 34b09b3a5b..facbaa2477 100644 --- a/lgc/interface/lgc/LgcDialect.h +++ b/lgc/interface/lgc/LgcDialect.h @@ -48,6 +48,8 @@ enum class CooperativeMatrixElementType : unsigned { Int32, // 32 bit integer Float16Packed, // packed 16-bit floating-point BFloat16, // 16-bit brain floating-point + Float8, // 8-bit floating-point + BFloat8, // 8-bit brain floating-point }; // Layout is virtual concept, eg: 16bit and 32bit for matrixC will share the same layout initially. diff --git a/lgc/interface/lgc/LgcDialect.td b/lgc/interface/lgc/LgcDialect.td index 8f904b2be7..eb4fb0f792 100644 --- a/lgc/interface/lgc/LgcDialect.td +++ b/lgc/interface/lgc/LgcDialect.td @@ -889,8 +889,8 @@ def CooperativeMatrixTimesScalarOp : LgcOp<"cooperative.matrix.times.scalar", [C def CooperativeMatrixMulAddOp : LgcOp<"cooperative.matrix.muladd", [Convergent, WillReturn]> { let arguments = (ins value:$matrix_a, value:$matrix_b, value:$matrix_c, AttrI1:$is_signed_a, AttrI1:$is_signed_b, - AttrI1:$is_sat_or_opsel, AttrI1:$is_tied, CooperativeMatrixElementType:$accu_elem_type, - CooperativeMatrixElementType:$factor_elem_type); + AttrI1:$is_sat_or_opsel, AttrI1:$is_tied, CooperativeMatrixElementType:$matrix_a_elem_type, + CooperativeMatrixElementType:$matrix_b_elem_type, CooperativeMatrixElementType:$matrix_c_elem_type); let results = (outs value:$result); let defaultBuilderHasExplicitResultType = true; @@ -913,8 +913,9 @@ def CooperativeMatrixMulAddOp : LgcOp<"cooperative.matrix.muladd", [Convergent, 'is_tied' is the flag of the output matrix has to be the same as the input accumulator (i.e., D has to be C) - 'accu_elem_type' is the component type of the accumulator matrix. - 'factor_elem_type' is the component type of the factor matrix. 
+ '$matrix_a_elem_type' is the component type of the matrix A + '$matrix_b_elem_type' is the component type of the matrix B + '$matrix_c_elem_type' is the component type of the matrix C }]; } diff --git a/lgc/interface/lgc/PassManager.h b/lgc/interface/lgc/PassManager.h index fa068a321d..c91b866e38 100644 --- a/lgc/interface/lgc/PassManager.h +++ b/lgc/interface/lgc/PassManager.h @@ -48,7 +48,7 @@ class LgcContext; class LegacyPassManager : public llvm::legacy::PassManager { public: static LegacyPassManager *Create(); - virtual ~LegacyPassManager() {} + virtual ~LegacyPassManager() = default; virtual void stop() = 0; virtual void setPassIndex(unsigned *passIndex) = 0; }; diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h index 5f6e9ce437..bca1917eae 100644 --- a/lgc/interface/lgc/Pipeline.h +++ b/lgc/interface/lgc/Pipeline.h @@ -335,7 +335,7 @@ struct ResourceNode { ResourceNodeType concreteType; // Underlying actual type of this node ResourceNodeType abstractType; // Node type for resource node matching - unsigned visibility; // Visibility bitmap: bit N set means entry is visible to ShaderStageEnum(N); value 0 + ShaderStageMask visibility; // Visibility bitmap: entry is visible to the shader stages in the mask; empty mask // means visible to all shader stages unsigned sizeInDwords; // Size in dwords unsigned offsetInDwords; // Offset in dwords @@ -741,7 +741,7 @@ class Pipeline { public: Pipeline(LgcContext *builderContext) : m_builderContext(builderContext) {} - virtual ~Pipeline() {} + virtual ~Pipeline() = default; // Get LgcContext LgcContext *getLgcContext() const { return m_builderContext; } diff --git a/lgc/interface/lgc/RayTracingLibrarySummary.h b/lgc/interface/lgc/RayTracingLibrarySummary.h index 9c65fd32ba..6c56d7ecf9 100644 --- a/lgc/interface/lgc/RayTracingLibrarySummary.h +++ b/lgc/interface/lgc/RayTracingLibrarySummary.h @@ -37,6 +37,7 @@ */ #pragma once +#include "llvmraytracing/PipelineState.h" #include "llvm/ADT/StringRef.h" 
#include "llvm/Support/Error.h" @@ -59,10 +60,6 @@ struct RayTracingLibrarySummary { // attributes (no AHS/IS/CHS). unsigned maxHitAttributeSize = 0; - // The maximum occurring number of payload registers in the pipeline, which will be taken into account for Traversal - // module so that it sees the correct maximum payload size of a pipeline. - unsigned maxUsedPayloadRegisterCount = 0; - // Whether a kernel entry function was built for this library. bool hasKernelEntry = false; @@ -76,6 +73,9 @@ struct RayTracingLibrarySummary { // flags). bool hasTraceRayModule = false; + // Opaque state owned by the llvmraytracing middle-end. + llvmraytracing::PipelineState llvmRaytracingState; + static llvm::Expected decodeMsgpack(llvm::StringRef data); std::string encodeMsgpack() const; diff --git a/lgc/patch/PatchLoopMetadata.cpp b/lgc/patch/AddLoopMetadata.cpp similarity index 99% rename from lgc/patch/PatchLoopMetadata.cpp rename to lgc/patch/AddLoopMetadata.cpp index e6af095cb8..d8d79d6b0c 100644 --- a/lgc/patch/PatchLoopMetadata.cpp +++ b/lgc/patch/AddLoopMetadata.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLoopMetadata.cpp + * @file AddLoopMetadata.cpp * @brief LLPC source file: contains implementation of class lgc::PatchLoopMetadata. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchLoopMetadata.h" +#include "lgc/patch/AddLoopMetadata.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/Support/CommandLine.h" diff --git a/lgc/patch/PatchCheckShaderCache.cpp b/lgc/patch/CheckShaderCache.cpp similarity index 94% rename from lgc/patch/PatchCheckShaderCache.cpp rename to lgc/patch/CheckShaderCache.cpp index 538f02a4d0..b8cc064322 100644 --- a/lgc/patch/PatchCheckShaderCache.cpp +++ b/lgc/patch/CheckShaderCache.cpp @@ -24,16 +24,16 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchCheckShaderCache.cpp - * @brief LLPC source file: contains implementation of class lgc::PatchCheckShaderCache. + * @file CheckShaderCache.cpp + * @brief LLPC source file: contains implementation of class lgc::CheckShaderCache. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchCheckShaderCache.h" +#include "lgc/patch/CheckShaderCache.h" #include "lgc/CommonDefs.h" #include "lgc/state/PipelineShaders.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "lgc-patch-check-shader-cache" +#define DEBUG_TYPE "lgc-check-shader-cache" using namespace llvm; using namespace lgc; @@ -57,7 +57,7 @@ template static void streamMapEntries(MapType &map, raw_ostream } // namespace // ===================================================================================================================== -PatchCheckShaderCache::PatchCheckShaderCache(Pipeline::CheckShaderCacheFunc callbackFunc) +CheckShaderCache::CheckShaderCache(Pipeline::CheckShaderCacheFunc callbackFunc) : m_callbackFunc(std::move(callbackFunc)) { } @@ -67,10 +67,10 @@ PatchCheckShaderCache::PatchCheckShaderCache(Pipeline::CheckShaderCacheFunc call // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchCheckShaderCache::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses CheckShaderCache::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Check-Shader-Cache\n"); + LLVM_DEBUG(dbgs() << "Run the pass Check-Shader-Cache\n"); if (m_callbackFunc == nullptr) { // No shader cache in use. 
diff --git a/lgc/patch/PatchImageOpCollect.cpp b/lgc/patch/CollectImageOperations.cpp similarity index 97% rename from lgc/patch/PatchImageOpCollect.cpp rename to lgc/patch/CollectImageOperations.cpp index dd316cbb88..069c3fe5c2 100644 --- a/lgc/patch/PatchImageOpCollect.cpp +++ b/lgc/patch/CollectImageOperations.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchImageOpCollect.cpp + * @file CollectImageOperations.cpp * @brief LLPC source file: contains implementation of class lgc::PatchImageOpCollect. *********************************************************************************************************************** */ -#include "lgc/patch/PatchImageOpCollect.h" +#include "lgc/patch/CollectImageOperations.h" #include "lgc/patch/Patch.h" #include "lgc/state/PipelineState.h" #include "llvm/InitializePasses.h" diff --git a/lgc/patch/CombineCooperativeMatrix.cpp b/lgc/patch/CombineCooperativeMatrix.cpp index 1b32e9753b..f1977df851 100644 --- a/lgc/patch/CombineCooperativeMatrix.cpp +++ b/lgc/patch/CombineCooperativeMatrix.cpp @@ -118,7 +118,7 @@ bool CooperativeMatrixCombiner::run() { [](auto &self, auto &op) { self.m_ops.push_back(&op); }) .add([](auto &self, auto &op) { #if !defined(LLVM_MAIN_REVISION) || LLVM_MAIN_REVISION >= 479080 - auto accumElemType = op.getAccuElemType(); + auto accumElemType = op.getMatrixCElemType(); bool isPackable = accumElemType == CooperativeMatrixElementType::Float16; if ((self.m_gfxIpVersion.major == 11) && isPackable) { self.m_muladds[op.getParent()].push_back(&op); diff --git a/lgc/patch/ConfigBuilderBase.cpp b/lgc/patch/ConfigBuilderBase.cpp index 9302cc39df..0823ef161f 100644 --- a/lgc/patch/ConfigBuilderBase.cpp +++ b/lgc/patch/ConfigBuilderBase.cpp @@ -75,10 +75,6 @@ 
ConfigBuilderBase::ConfigBuilderBase(Module *module, PipelineState *pipelineStat setApiName(pipelineState->getClient()); } -// ===================================================================================================================== -ConfigBuilderBase::~ConfigBuilderBase() { -} - // ===================================================================================================================== /// Adds the .shaders.$(apiStage).hardware_mapping node to the PAL metadata. /// Also add .shader_subtype if it is a compute shader. diff --git a/lgc/patch/ConfigBuilderBase.h b/lgc/patch/ConfigBuilderBase.h index 31c6c57acb..873a036270 100644 --- a/lgc/patch/ConfigBuilderBase.h +++ b/lgc/patch/ConfigBuilderBase.h @@ -60,7 +60,6 @@ struct PalMetadataNoteEntry { class ConfigBuilderBase { public: ConfigBuilderBase(llvm::Module *module, PipelineState *pipelineState); - ~ConfigBuilderBase(); void writePalMetadata(); llvm::msgpack::MapDocNode &getGraphicsRegNode() { return m_graphicsRegistersNode; } @@ -130,7 +129,7 @@ class ConfigBuilderBase { llvm::msgpack::Document *m_document; // The MsgPack document llvm::msgpack::MapDocNode m_pipelineNode; // MsgPack map node for amdpal.pipelines[0] - llvm::DenseMap m_apiShaderNodes; + ShaderStageMap m_apiShaderNodes; // MsgPack map node for each API shader's node in // ".shaders" llvm::msgpack::MapDocNode m_hwShaderNodes[unsigned(Util::Abi::HardwareStage::Count)]; diff --git a/lgc/patch/FragColorExport.cpp b/lgc/patch/FragColorExport.cpp index c8459c1514..52a5797573 100644 --- a/lgc/patch/FragColorExport.cpp +++ b/lgc/patch/FragColorExport.cpp @@ -755,26 +755,6 @@ void LowerFragColorExport::collectExportInfoForBuiltinOutput(Function *module, B m_exportValues[MaxColorTargets] = output; } -// ===================================================================================================================== -// Generates a dummy export instruction. Returns last export instruction that was generated. 
-// -// @param builder : The builder object that will be used to create new instructions. -CallInst *FragColorExport::addDummyExport(BuilderBase &builder) { - auto zero = ConstantFP::get(builder.getFloatTy(), 0.0); - auto poison = PoisonValue::get(builder.getFloatTy()); - Value *args[] = { - builder.getInt32(EXP_TARGET_MRT_0), // tgt - builder.getInt32(0x1), // en - zero, // src0 - poison, // src1 - poison, // src2 - poison, // src3 - builder.getFalse(), // done - builder.getTrue() // vm - }; - return builder.CreateIntrinsic(Intrinsic::amdgcn_exp, builder.getFloatTy(), args); -} - // ===================================================================================================================== // Sets the done flag on the given export instruction. // @@ -1061,9 +1041,35 @@ void FragColorExport::generateExportInstructions(ArrayRef info, } } if (!lastExport && dummyExport) { - lastExport = FragColorExport::addDummyExport(builder); + // NOTE: We maybe should not set SPI_SHADER_COL_FORMAT to 0 because of observe corruptions in some games. + // For performance, we must set the CB_SHADER_MASK to non-zero for RB+ optimization. In this case, PAL re-sets + // SPI_SHADER_COL_FORMAT to 32R, maybe causing a mismatch with CB_SHADER_MASK, there seems to be no impact on + // performance. + // For correctness, we should enable all channels enabled via the export format and write 0. + auto zero = ConstantFP::get(builder.getFloatTy(), 0.0); + auto zeros = ConstantVector::get({zero, zero, zero, zero}); + const auto expFmt = key.dummyExpFmt == EXP_FORMAT_ZERO ? 
EXP_FORMAT_32_R : key.dummyExpFmt; + lastExport = handleColorExportInstructions(zeros, 0, builder, expFmt, false, false); palMetadata->setPsDummyExport(); - finalExportFormats.push_back(EXP_FORMAT_32_R); + finalExportFormats.push_back(expFmt); + switch (expFmt) { + case EXP_FORMAT_32_R: { + cbShaderMask = 0x1U; + break; + } + case EXP_FORMAT_32_GR: { + cbShaderMask = 0x3U; + break; + } + case EXP_FORMAT_32_AR: { + cbShaderMask = 0x9U; + break; + } + default: { + cbShaderMask = 0xFU; + break; + } + } } if (lastExport) FragColorExport::setDoneFlag(lastExport, builder); @@ -1154,6 +1160,8 @@ FragColorExport::Key FragColorExport::computeKey(ArrayRef infos key.enableFragColor = pipelineState->getOptions().enableFragColor; key.colorExportState = pipelineState->getColorExportState(); key.waveSize = pipelineState->getShaderWaveSize(ShaderStage::Fragment); + key.dummyExpFmt = static_cast( + pipelineState->computeExportFormat(Type::getFloatTy(pipelineState->getContext()), 0, false)); if (!infos.empty() && infos[0].hwColorTarget == MaxColorTargets) { infos = infos.drop_front(1); diff --git a/lgc/patch/PatchCopyShader.cpp b/lgc/patch/GenerateCopyShader.cpp similarity index 92% rename from lgc/patch/PatchCopyShader.cpp rename to lgc/patch/GenerateCopyShader.cpp index b822fb5e94..ad4c50a5a1 100644 --- a/lgc/patch/PatchCopyShader.cpp +++ b/lgc/patch/GenerateCopyShader.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchCopyShader.cpp - * @brief LLPC source file: contains declaration and implementation of class lgc::PatchCopyShader. + * @file GenerateCopyShader.cpp + * @brief LLPC source file: contains declaration and implementation of class lgc::GenerateCopyShader. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchCopyShader.h" +#include "lgc/patch/GenerateCopyShader.h" #include "lgc/state/IntrinsDefs.h" #include "lgc/state/PalMetadata.h" #include "lgc/state/PipelineShaders.h" @@ -43,7 +43,7 @@ #include "llvm/Support/Debug.h" #include -#define DEBUG_TYPE "lgc-patch-copy-shader" +#define DEBUG_TYPE "lgc-generate-copy-shader" using namespace lgc; using namespace llvm; @@ -54,11 +54,11 @@ using namespace llvm; // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses GenerateCopyShader::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); PipelineShadersResult &pipelineShaders = analysisManager.getResult(module); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Copy-Shader\n"); + LLVM_DEBUG(dbgs() << "Run the pass Generate-Copy-Shader\n"); Patch::init(&module); @@ -193,13 +193,17 @@ PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &an if (m_pipelineState->isGsOnChip()) m_lds = Patch::getLdsVariable(m_pipelineState, entryPoint); - unsigned outputStreamCount = 0; - for (int i = 0; i < MaxGsStreams; ++i) { + SmallVector activeStreams; + for (unsigned i = 0; i < MaxGsStreams; ++i) { if (m_pipelineState->isVertexStreamActive(i)) - outputStreamCount++; + activeStreams.push_back(i); } + assert(!activeStreams.empty()); + + if (activeStreams.size() > 1) { + // Multiple streams + assert(m_pipelineState->enableXfb() || m_pipelineState->enablePrimStats()); - if (outputStreamCount > 1 && m_pipelineState->enableXfb()) { if 
(!m_pipelineState->getNggControl()->enableNgg) { // StreamId = streamInfo[25:24] auto streamInfo = getFunctionArgument(entryPoint, CopyShaderEntryArgIdxStreamInfo); @@ -232,19 +236,17 @@ PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &an // // Add switchInst to entry block - auto switchInst = builder.CreateSwitch(streamId, endBlock, outputStreamCount); + auto switchInst = builder.CreateSwitch(streamId, endBlock, activeStreams.size()); - for (unsigned streamId = 0; streamId < MaxGsStreams; ++streamId) { - if (m_pipelineState->isVertexStreamActive(streamId)) { - std::string blockName = ".stream" + std::to_string(streamId); - BasicBlock *streamBlock = BasicBlock::Create(*m_context, blockName, entryPoint, endBlock); - builder.SetInsertPoint(streamBlock); + for (auto activeStream : activeStreams) { + std::string blockName = ".stream" + std::to_string(activeStream); + BasicBlock *streamBlock = BasicBlock::Create(*m_context, blockName, entryPoint, endBlock); + builder.SetInsertPoint(streamBlock); - switchInst->addCase(builder.getInt32(streamId), streamBlock); + switchInst->addCase(builder.getInt32(activeStream), streamBlock); - exportOutput(streamId, builder); - builder.CreateBr(endBlock); - } + exportOutput(activeStream, builder); + builder.CreateBr(endBlock); } } else { // NOTE: If NGG, the copy shader with stream-out is not a real HW VS and will be incorporated into NGG @@ -264,15 +266,13 @@ PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &an // assert(gfxIp.major >= 11); // Must be GFX11+ - for (unsigned streamId = 0; streamId < MaxGsStreams; ++streamId) { - if (m_pipelineState->isVertexStreamActive(streamId)) - exportOutput(streamId, builder); - } + for (auto activeStream : activeStreams) + exportOutput(activeStream, builder); builder.CreateBr(endBlock); } } else { - // Just export outputs of rasterization stream - exportOutput(m_pipelineState->getRasterizerState().rasterStream, builder); + // Single stream + 
exportOutput(activeStreams[0], builder); builder.CreateBr(endBlock); } @@ -283,7 +283,7 @@ PreservedAnalyses PatchCopyShader::run(Module &module, ModuleAnalysisManager &an // Collects info for GS generic outputs. // // @param gsEntryPoint : Geometry shader entrypoint -void PatchCopyShader::collectGsGenericOutputInfo(Function *gsEntryPoint) { +void GenerateCopyShader::collectGsGenericOutputInfo(Function *gsEntryPoint) { auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::CopyShader); const auto &outputLocInfoMap = resUsage->inOutUsage.outputLocInfoMap; std::set visitedLocInfos; @@ -351,7 +351,9 @@ void PatchCopyShader::collectGsGenericOutputInfo(Function *gsEntryPoint) { // // @param streamId : Export output of this stream // @param builder : BuilderBase to use for instruction constructing -void PatchCopyShader::exportOutput(unsigned streamId, BuilderBase &builder) { +void GenerateCopyShader::exportOutput(unsigned streamId, BuilderBase &builder) { + assert(streamId < MaxGsStreams); + auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::CopyShader); auto &builtInUsage = resUsage->builtInUsage.gs; auto &locInfoXfbOutInfoMap = resUsage->inOutUsage.locInfoXfbOutInfoMap; @@ -486,8 +488,8 @@ void PatchCopyShader::exportOutput(unsigned streamId, BuilderBase &builder) { // @param compIdx : Output component // @param streamId : Output stream ID // @param builder : BuilderBase to use for instruction constructing -Value *PatchCopyShader::calcGsVsRingOffsetForInput(unsigned location, unsigned compIdx, unsigned streamId, - BuilderBase &builder) { +Value *GenerateCopyShader::calcGsVsRingOffsetForInput(unsigned location, unsigned compIdx, unsigned streamId, + BuilderBase &builder) { auto entryPoint = builder.GetInsertBlock()->getParent(); Value *vertexOffset = getFunctionArgument(entryPoint, CopyShaderEntryArgIdxVertexOffset); @@ -518,8 +520,8 @@ Value *PatchCopyShader::calcGsVsRingOffsetForInput(unsigned location, unsigned c // @param component : 
Output component // @param streamId : Output stream ID // @param builder : BuilderBase to use for instruction constructing -Value *PatchCopyShader::loadValueFromGsVsRing(Type *loadTy, unsigned location, unsigned component, unsigned streamId, - BuilderBase &builder) { +Value *GenerateCopyShader::loadValueFromGsVsRing(Type *loadTy, unsigned location, unsigned component, unsigned streamId, + BuilderBase &builder) { auto entryPoint = builder.GetInsertBlock()->getParent(); unsigned elemCount = 1; @@ -601,7 +603,7 @@ Value *PatchCopyShader::loadValueFromGsVsRing(Type *loadTy, unsigned location, u // @param outputValue : Value exported to output // @param location : Location of the output // @param builder : BuilderBase to use for instruction constructing -void PatchCopyShader::exportGenericOutput(Value *outputValue, unsigned location, BuilderBase &builder) { +void GenerateCopyShader::exportGenericOutput(Value *outputValue, unsigned location, BuilderBase &builder) { auto outputTy = outputValue->getType(); assert(outputTy->isSingleValueType()); std::string instName(lgcName::OutputExportGeneric); @@ -615,7 +617,7 @@ void PatchCopyShader::exportGenericOutput(Value *outputValue, unsigned location, // @param outputValue : Value exported to output // @param xfbOutInfo : The reference to a transform feedback output info // @param builder : BuilderBase to use for instruction constructing -void PatchCopyShader::exportXfbOutput(Value *outputValue, const XfbOutInfo &xfbOutInfo, BuilderBase &builder) { +void GenerateCopyShader::exportXfbOutput(Value *outputValue, const XfbOutInfo &xfbOutInfo, BuilderBase &builder) { if (xfbOutInfo.is16bit) { // NOTE: For 16-bit transform feedback output, the value is 32-bit dword loaded from GS-VS ring // buffer. The high word is always zero while the low word contains the data value. 
We have to @@ -657,8 +659,8 @@ void PatchCopyShader::exportXfbOutput(Value *outputValue, const XfbOutInfo &xfbO // @param builtInId : ID of the built-in variable // @param streamId : ID of output vertex stream // @param builder : BuilderBase to use for instruction constructing -void PatchCopyShader::exportBuiltInOutput(Value *outputValue, BuiltInKind builtInId, unsigned streamId, - BuilderBase &builder) { +void GenerateCopyShader::exportBuiltInOutput(Value *outputValue, BuiltInKind builtInId, unsigned streamId, + BuilderBase &builder) { auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::CopyShader); if (m_pipelineState->enableXfb()) { diff --git a/lgc/patch/PatchLlvmIrInclusion.cpp b/lgc/patch/IncludeLlvmIr.cpp similarity index 97% rename from lgc/patch/PatchLlvmIrInclusion.cpp rename to lgc/patch/IncludeLlvmIr.cpp index 3ff1c7f387..9553f03706 100644 --- a/lgc/patch/PatchLlvmIrInclusion.cpp +++ b/lgc/patch/IncludeLlvmIr.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLlvmIrInclusion.cpp + * @file IncludeLlvmIr.cpp * @brief LLPC source file: contains implementation of class lgc::PatchLlvmIrInclusion. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchLlvmIrInclusion.h" +#include "lgc/patch/IncludeLlvmIr.h" #include "lgc/state/Abi.h" #include "lgc/state/PipelineState.h" #include "llvm/IR/Constants.h" diff --git a/lgc/patch/LowerCooperativeMatrix.cpp b/lgc/patch/LowerCooperativeMatrix.cpp index ad7b274447..bec5cec5d7 100644 --- a/lgc/patch/LowerCooperativeMatrix.cpp +++ b/lgc/patch/LowerCooperativeMatrix.cpp @@ -48,6 +48,37 @@ using namespace lgc; namespace lgc { +static const Intrinsic::AMDGCNIntrinsics InvalidInstricID = Intrinsic::AMDGCNIntrinsics(0xFFFFFFFF); +static const Intrinsic::AMDGCNIntrinsics GetWmmaIntrinsic(GfxIpVersion gfxIp, CooperativeMatrixElementType typeA, + CooperativeMatrixElementType typeB, + CooperativeMatrixElementType typeC, bool isTiled = false) { + assert(gfxIp.major >= 11); + switch (typeA) { + case CooperativeMatrixElementType::Float16: { + assert(typeA == typeB); + if (typeC == CooperativeMatrixElementType::Float16) + return isTiled ? Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied : Intrinsic::amdgcn_wmma_f16_16x16x16_f16; + if (typeC == CooperativeMatrixElementType::Float32) + return Intrinsic::amdgcn_wmma_f32_16x16x16_f16; + } + case CooperativeMatrixElementType::BFloat16: { + assert(typeA == typeB); + if (typeC == CooperativeMatrixElementType::BFloat16) + return isTiled ? 
Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied : Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; + if (typeC == CooperativeMatrixElementType::Float32) + return Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; + } + case CooperativeMatrixElementType::Int8: { + if (typeC == CooperativeMatrixElementType::Int32) + return Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; + } + default: + break; + } + + return InvalidInstricID; +} + // ===================================================================================================================== // Run the patch cooperative matrix pass on a module // @@ -130,6 +161,8 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties props.numMatrixWords = 8; break; case CooperativeMatrixElementType::Int8: + case CooperativeMatrixElementType::Float8: + case CooperativeMatrixElementType::BFloat8: props.numMatrixElements = 16; props.numMatrixWords = 4; break; @@ -137,7 +170,7 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties llvm_unreachable("unknown element type"); } - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); if (layout == CooperativeMatrixLayout::FactorMatrixLayout) { assert(elemType != CooperativeMatrixElementType::Float32 && elemType != CooperativeMatrixElementType::Int32); props.numFlatElements = 16; @@ -273,7 +306,7 @@ LowerCooperativeMatrix::computeAddressing(CooperativeMatrixLayout layout, Cooper void LowerCooperativeMatrix::visitCooperativeMatrixLengthOp(CooperativeMatrixLengthOp &matrixlength) { BuilderBase builder(*m_context); builder.SetInsertPoint(&matrixlength); - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); auto layout = matrixlength.getLayout(); unsigned length = 0; switch (layout) { @@ -466,7 +499,7 @@ void 
LowerCooperativeMatrix::visitCooperativeMatrixExtractOp(CooperativeMatrixEx // a specialization constant even though, at the time of specialization constant lowering, we don't yet know the // wave size. We should remove this once a healther KHR extension has been released. if (layout == CooperativeMatrixLayout::AccumulatorMatrixLayout && - m_pipelineState->getShaderWaveSize(m_shaderStage) == 64) { + m_pipelineState->getShaderWaveSize(m_shaderStage.value()) == 64) { unsigned length = cast(vec->getType())->getNumElements(); index = builder.CreateAnd(index, builder.getInt32(length - 1)); } @@ -495,7 +528,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixInsertOp(CooperativeMatrixIns // a specialization constant even though, at the time of specialization constant lowering, we don't yet know the // wave size. We should remove this once a healther KHR extension has been released. if (layout == CooperativeMatrixLayout::AccumulatorMatrixLayout && - m_pipelineState->getShaderWaveSize(m_shaderStage) == 64) { + m_pipelineState->getShaderWaveSize(m_shaderStage.value()) == 64) { unsigned length = cast(vec->getType())->getNumElements(); Value *outOfBounds = builder.CreateICmpUGE(index, builder.getInt32(length)); index = builder.CreateAnd(index, builder.getInt32(length - 1)); @@ -557,12 +590,11 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvertInternal(CastInst::CastOp resultValue = builder.CreateCast(Instruction::FPExt, source, FixedVectorType::get(builder.getFloatTy(), vecSize), "Convert16tofloat32"); resultValue = builder.CreateFPTrunc(resultValue, dstType); - } else { + } else resultValue = builder.CreateCast(castOp, source, dstType, "castOpConvert"); - } if (dstElemType == CooperativeMatrixElementType::BFloat16) { - resultValue = builder.CreateBitCast(resultValue, FixedVectorType::get(builder.getInt16Ty(), vecSize)); + return builder.CreateBitCast(resultValue, FixedVectorType::get(builder.getInt16Ty(), vecSize)); } return resultValue; @@ -731,7 +763,7 @@ Value 
*LowerCooperativeMatrix::cooperativeMatrixReshape16BitElementGfx1011( BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); Value *resultValue = nullptr; - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); Value *laneGroupIdx = builder.CreateUDiv(threadId, builder.getInt32(16)); Value *isEvenGroup = builder.CreateICmpEQ(builder.CreateAnd(laneGroupIdx, builder.getInt32(1)), builder.getInt32(0)); @@ -919,7 +951,7 @@ Value *LowerCooperativeMatrix::cooperativeMatrixReshapeBetween8bitAnd32bitElemen BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); Value *resultValue = nullptr; - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); Value *threadId = getLaneNumber(builder); Value *laneGroupIdx = builder.CreateUDiv(threadId, builder.getInt32(16)); Value *isEvenGroup = builder.CreateICmpEQ(builder.CreateAnd(laneGroupIdx, builder.getInt32(1)), builder.getInt32(0)); @@ -1386,50 +1418,53 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul Value *matrixA = muladd.getMatrixA(); Value *matrixB = muladd.getMatrixB(); Value *matrixC = muladd.getMatrixC(); - auto factorElemType = muladd.getFactorElemType(); - auto accumElemType = muladd.getAccuElemType(); + auto matrixAType = muladd.getMatrixAElemType(); + auto matrixBType = muladd.getMatrixBElemType(); + auto matrixCType = muladd.getMatrixCElemType(); bool isSignedA = muladd.getIsSignedA(); bool isSignedB = muladd.getIsSignedB(); bool isSatOrOpsel = muladd.getIsSatOrOpsel(); StringRef instName = muladd.getName(); + // Gfx11: + // wave64: + // declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>) + // declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>) + // declare <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg) + // declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg) + // declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 + // immarg) declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x + // i32>, i1 immarg) + // wave32: + // declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>) + // declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>) + // declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) + // declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) + // declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 + // immarg) declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x + // i32>, i1 immarg) + if (m_gfxIp.major >= 11) { - // Gfx11: - // wave64: - // declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>) - // declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>) - // declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg) - // declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg) - // declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 - // immarg) declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x - // i32>, i1 immarg) - // wave32: - // declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>) - // declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>) - // declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) - // declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) - // declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 - // immarg) declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x - // i32>, i1 immarg) Value *matrixD; - unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); - if (BuilderCommon::isTypeNCooperativeMatrix(factorElemType, 16)) { + if (BuilderCommon::isTypeNCooperativeMatrix(matrixAType, 16)) { + assert(matrixAType == matrixBType); unsigned factorFlatElemNum = 0; { factorFlatElemNum = 16; } Type *factorType = - FixedVectorType::get(builder.transCooperativeMatrixElementType(factorElemType), factorFlatElemNum); + FixedVectorType::get(builder.transCooperativeMatrixElementType(matrixAType), factorFlatElemNum); matrixA = builder.CreateBitCast(matrixA, factorType); matrixB = builder.CreateBitCast(matrixB, factorType); - } else if (BuilderCommon::isTypeNCooperativeMatrix(factorElemType, 8)) { + } else if (BuilderCommon::isTypeNCooperativeMatrix(matrixAType, 8)) { } else { llvm_unreachable("Factor element type is not supported!"); } - if (BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 32)) { + if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 32)) { matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") : matrixC; - } else if (BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 16)) { + } else if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 16)) { { matrixC = waveSize == 64 ? 
builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") : matrixC; @@ -1437,7 +1472,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul unsigned matrixLength = cast(matrixC->getType())->getNumElements(); Type *castType = nullptr; - if (accumElemType == CooperativeMatrixElementType::BFloat16) { + if (matrixCType == CooperativeMatrixElementType::BFloat16) { // HW instructions require i16 type for bfloat16. castType = builder.getInt16Ty(); } else @@ -1448,51 +1483,44 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul llvm_unreachable("Accumulator element type is not supported!"); } - if (factorElemType == CooperativeMatrixElementType::BFloat16) { - Intrinsic::AMDGCNIntrinsics intrinsic = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; - SmallVector args({matrixA, matrixB, matrixC}); - if (accumElemType == CooperativeMatrixElementType::Float32) - intrinsic = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; - else { - assert(accumElemType == CooperativeMatrixElementType::BFloat16); - args.push_back(builder.getInt1(isSatOrOpsel)); - if (muladd.getIsTied()) - intrinsic = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; - } - matrixD = builder.CreateIntrinsic(matrixC->getType(), intrinsic, args, nullptr, instName); - } else if (factorElemType == CooperativeMatrixElementType::Float16 && - accumElemType == CooperativeMatrixElementType::Float32) { - matrixD = builder.CreateIntrinsic(matrixC->getType(), Intrinsic::amdgcn_wmma_f32_16x16x16_f16, - {matrixA, matrixB, matrixC}, nullptr, instName); - - } else if (factorElemType == CooperativeMatrixElementType::Int8 && - accumElemType == CooperativeMatrixElementType::Int32) { - matrixD = builder.CreateIntrinsic(matrixC->getType(), Intrinsic::amdgcn_wmma_i32_16x16x16_iu8, - {builder.getInt1(isSignedA), matrixA, builder.getInt1(isSignedB), matrixB, - matrixC, builder.getInt1(isSatOrOpsel)}, - nullptr, instName); - - } else if (factorElemType == 
CooperativeMatrixElementType::Float16 && - accumElemType == CooperativeMatrixElementType::Float16) { - // Matrix convert to match intrinsic arguments: Wave32: float32*v8->half*v16 - // Wave64: float32*v4->half*v8 - bool isTied = muladd.getIsTied(); - auto intrinsic = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; - if (isTied) -#if defined(LLVM_MAIN_REVISION) && LLVM_MAIN_REVISION < 479080 - llvm_unreachable("Tied intrinsics not implemented"); -#else - intrinsic = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; -#endif - matrixD = builder.CreateIntrinsic(matrixC->getType(), intrinsic, - {matrixA, matrixB, matrixC, builder.getInt1(isSatOrOpsel)}, nullptr, instName); - } else { - llvm_unreachable("The accumulator type is not supported."); + auto intrinsic = GetWmmaIntrinsic(m_gfxIp, matrixAType, matrixBType, matrixCType, muladd.getIsTied()); + if (intrinsic == InvalidInstricID) + llvm_unreachable("HW intrinsics not supported!"); + + SmallVector args; + switch (intrinsic) { + case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: + args.push_back(matrixA); + args.push_back(matrixB); + args.push_back(matrixC); + break; + case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: + case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: + case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: + case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: + args.push_back(matrixA); + args.push_back(matrixB); + args.push_back(matrixC); + args.push_back(builder.getInt1(isSatOrOpsel)); + break; + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: + args.push_back(builder.getInt1(isSignedA)); + args.push_back(matrixA); + args.push_back(builder.getInt1(isSignedB)); + args.push_back(matrixB); + args.push_back(matrixC); + args.push_back(builder.getInt1(isSatOrOpsel)); + break; + default: + llvm_unreachable("Should never be called!"); + break; } + matrixD = builder.CreateIntrinsic(matrixC->getType(), intrinsic, args, nullptr, instName); - if 
(BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 16)) { + if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 16)) { unsigned coopVeclength = cast(matrixD->getType())->getNumElements(); - Type *wordTy = builder.transCooperativeMatrixElementType(accumElemType)->isIntOrIntVectorTy() + Type *wordTy = builder.transCooperativeMatrixElementType(matrixCType)->isIntOrIntVectorTy() ? builder.getInt32Ty() : builder.getFloatTy(); matrixD = builder.CreateBitCast(matrixD, FixedVectorType::get(wordTy, coopVeclength / 2)); @@ -1512,8 +1540,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul } // Emulator on NAVI2X - Type *packedTy = - (factorElemType == CooperativeMatrixElementType::Float16) ? builder.getFloatTy() : builder.getInt32Ty(); + Type *packedTy = (matrixAType == CooperativeMatrixElementType::Float16) ? builder.getFloatTy() : builder.getInt32Ty(); Value *dotProductValue; Value *threadId = getLaneNumber(builder); @@ -1529,7 +1556,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul }; // matrixC is not reshaped for gfx10 - if (accumElemType == CooperativeMatrixElementType::Float32 || accumElemType == CooperativeMatrixElementType::Int32) { + if (matrixCType == CooperativeMatrixElementType::Float32 || matrixCType == CooperativeMatrixElementType::Int32) { dotProductValue = PoisonValue::get(FixedVectorType::get(packedTy, 8)); for (unsigned idxc = 0; idxc < 8; ++idxc) { Value *rowlowgroup = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc * 2)); @@ -1537,25 +1564,24 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul Value *rowData = builder.CreateSelect(isEvenGroup, rowlowgroup, rowhighgroup); Value *mulAB; Value *initAccumulator = builder.CreateExtractElement(matrixC, idxc); - if (factorElemType == CooperativeMatrixElementType::Float16) { + if (matrixAType == CooperativeMatrixElementType::Float16) { mulAB = 
createDotProductFp16Fp32(rowData, matrixB, initAccumulator, isSatOrOpsel, instName, &muladd); - } else if (factorElemType == CooperativeMatrixElementType::Int16) { + } else if (matrixAType == CooperativeMatrixElementType::Int16) { mulAB = createDotProductInt16Int32(rowData, matrixB, initAccumulator, flags, isSatOrOpsel, instName, &muladd); - } else if (factorElemType == CooperativeMatrixElementType::Int8) { + } else if (matrixAType == CooperativeMatrixElementType::Int8) { mulAB = createDotProductInt8Int32(rowData, matrixB, initAccumulator, flags, isSatOrOpsel, instName, &muladd); } else { llvm_unreachable("Unsupported element type!"); } dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB, idxc); } - } else if (accumElemType == CooperativeMatrixElementType::Int16 || - accumElemType == CooperativeMatrixElementType::Float16) { - dotProductValue = - PoisonValue::get(FixedVectorType::get(builder.transCooperativeMatrixElementType(accumElemType), 8)); + } else if (matrixCType == CooperativeMatrixElementType::Int16 || + matrixCType == CooperativeMatrixElementType::Float16) { + dotProductValue = PoisonValue::get(FixedVectorType::get(builder.transCooperativeMatrixElementType(matrixCType), 8)); // For gfx10, A*B:8*float32->16*half C: no reshape for 16bit, still 16*half Value *colData = - convCoopMatrixVecToFlatVec(builder, matrixB, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); - matrixC = convCoopMatrixVecToFlatVec(builder, matrixC, accumElemType, + convCoopMatrixVecToFlatVec(builder, matrixB, matrixAType, CooperativeMatrixLayout::FactorMatrixLayout); + matrixC = convCoopMatrixVecToFlatVec(builder, matrixC, matrixCType, CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout); for (unsigned idxc = 0, accIdx = 0; idxc < 16; idxc += 4, accIdx += 2) { @@ -1568,16 +1594,16 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul Value *rowData2 = builder.CreateSelect(isEvenGroup, rowData2Low, rowData2High); rowData1 = 
- convCoopMatrixVecToFlatVec(builder, rowData1, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); + convCoopMatrixVecToFlatVec(builder, rowData1, matrixAType, CooperativeMatrixLayout::FactorMatrixLayout); rowData2 = - convCoopMatrixVecToFlatVec(builder, rowData2, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); + convCoopMatrixVecToFlatVec(builder, rowData2, matrixAType, CooperativeMatrixLayout::FactorMatrixLayout); Value *mulAB1; Value *mulAB2; Value *accumulator1 = builder.CreateExtractElement(matrixC, accIdx); Value *accumulator2 = builder.CreateExtractElement(matrixC, accIdx + 1); - if (accumElemType == CooperativeMatrixElementType::Float16) { + if (matrixCType == CooperativeMatrixElementType::Float16) { mulAB1 = createDotProductFp16Fp16(rowData1, colData, accumulator1, isSatOrOpsel, instName, &muladd); mulAB2 = createDotProductFp16Fp16(rowData2, colData, accumulator2, isSatOrOpsel, instName, &muladd); } else { @@ -1588,7 +1614,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB2, accIdx + 1); } - dotProductValue = convFlatVecToCoopMatrixVec(builder, dotProductValue, accumElemType, + dotProductValue = convFlatVecToCoopMatrixVec(builder, dotProductValue, matrixCType, CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout); } else { llvm_unreachable("The accumulator type is not supported."); @@ -1884,7 +1910,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixUnPackOp(CooperativeMatrixUnP // @param builder : The IR builder to create and insert IR instruction Value *LowerCooperativeMatrix::getLaneNumber(BuilderBase &builder) { Value *result = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {builder.getInt32(-1), builder.getInt32(0)}); - if (m_pipelineState->getShaderWaveSize(m_shaderStage) == 64) + if (m_pipelineState->getShaderWaveSize(m_shaderStage.value()) == 64) result = 
builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {builder.getInt32(-1), result}); return result; } @@ -2082,7 +2108,7 @@ void LowerCooperativeMatrix::visitCooperativeRowAccExpandOp(CooperativeRowAccExp {mappedArgs[0], passthroughArgs[0], passthroughArgs[1], passthroughArgs[2], passthroughArgs[3]}); }; - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); assert(waveSize == 32 || waveSize == 64); DppCtrl shuffleCtrl[4] = {DppCtrl(UINT32_MAX), DppCtrl(UINT32_MAX), DppCtrl(UINT32_MAX), DppCtrl(UINT32_MAX)}; diff --git a/lgc/patch/LowerDebugPrintf.cpp b/lgc/patch/LowerDebugPrintf.cpp index 98bfe7a2d4..8f9e0d7e97 100644 --- a/lgc/patch/LowerDebugPrintf.cpp +++ b/lgc/patch/LowerDebugPrintf.cpp @@ -75,8 +75,10 @@ PreservedAnalyses LowerDebugPrintf::run(Module &module, ModuleAnalysisManager &a return PreservedAnalyses::all(); const ResourceNode *node = nullptr; - std::tie(m_topNode, node) = pipelineState->findResourceNode(ResourceNodeType::DescriptorBuffer, - InternalDescriptorSetId, PrintfBufferBindingId); + // LLpc node type is DescriptorBuffer + // So use ResourceNodeType::Unknown to match different node type. 
+ std::tie(m_topNode, node) = + pipelineState->findResourceNode(ResourceNodeType::Unknown, InternalDescriptorSetId, PrintfBufferBindingId); static const auto lowerDebugfPrintOpVisitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) diff --git a/lgc/patch/LowerGpuRt.cpp b/lgc/patch/LowerGpuRt.cpp index 33a65e3fbd..fcc2eff4c3 100644 --- a/lgc/patch/LowerGpuRt.cpp +++ b/lgc/patch/LowerGpuRt.cpp @@ -59,8 +59,7 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi Builder builderImpl(pipelineState->getContext()); m_builder = &builderImpl; - - createGlobalStack(module); + createLdsStack(module); static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) @@ -104,20 +103,25 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi // ===================================================================================================================== // Get pipeline workgroup size for stack size calculation -unsigned LowerGpuRt::getWorkgroupSize() const { +// @param [in] Function : The function to retrieve shader information +unsigned LowerGpuRt::getWorkgroupSize(Function *func) const { unsigned workgroupSize = 0; - if (m_pipelineState->isGraphics()) { - // Force 64 for graphics stages - workgroupSize = 64; - } else { + auto stage = getShaderStage(func); + const unsigned waveSize = m_pipelineState->getShaderWaveSize(stage.value()); + if (stage == ShaderStage::Mesh) { + auto &meshMode = m_pipelineState->getShaderModes()->getMeshShaderMode(); + workgroupSize = meshMode.workgroupSizeX * meshMode.workgroupSizeY * meshMode.workgroupSizeZ; + } else if (stage == ShaderStage::Task || stage == ShaderStage::Compute) { ComputeShaderMode mode = m_pipelineState->getShaderModes()->getComputeShaderMode(); workgroupSize = mode.workgroupSizeX * mode.workgroupSizeY * mode.workgroupSizeZ; + } else { + 
assert(m_pipelineState->isGraphics()); + workgroupSize = 64; } assert(workgroupSize != 0); - if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) { - // Round up to multiple of 32, as the ds_bvh_stack swizzle as 32 threads - workgroupSize = alignTo(workgroupSize, 32); - } + + workgroupSize = alignTo(workgroupSize, waveSize); + return workgroupSize; } @@ -136,33 +140,47 @@ Value *LowerGpuRt::getThreadIdInGroup() const { } // ===================================================================================================================== -// Create global variable for the stack +// Update the workgroup size from different functions +// @param func : Function to get WorkgroupSize from +void LowerGpuRt::updateWorkgroupSize(Function *func) { + unsigned funcWorkSize = getWorkgroupSize(func); + m_workGroupSize = m_workGroupSize > funcWorkSize ? m_workGroupSize : funcWorkSize; +} + +// ===================================================================================================================== +// Create global variable for the lds stack // @param [in/out] module : LLVM module to be run on -void LowerGpuRt::createGlobalStack(Module &module) { +void LowerGpuRt::createLdsStack(Module &module) { struct Payload { - bool needGlobalStack; + bool needLdsStack; bool needExtraStack; + LowerGpuRt *lowerRt; }; - Payload payload = {false, false}; + Payload payload = {false, false, this}; + m_workGroupSize = 0; static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add([](auto &payload, auto &op) { - payload.needGlobalStack = true; + payload.needLdsStack = true; payload.needExtraStack |= op.getUseExtraStack(); + payload.lowerRt->updateWorkgroupSize(op.getFunction()); }) .add([](auto &payload, auto &op) { - payload.needGlobalStack = true; + payload.needLdsStack = true; payload.needExtraStack |= op.getUseExtraStack(); + payload.lowerRt->updateWorkgroupSize(op.getFunction()); }) .add([](auto 
&payload, auto &op) { - payload.needGlobalStack = true; + payload.needLdsStack = true; payload.needExtraStack |= op.getUseExtraStack(); + payload.lowerRt->updateWorkgroupSize(op.getFunction()); }) .build(); visitor.visit(payload, module); - if (payload.needGlobalStack) { - auto ldsStackSize = getWorkgroupSize() * MaxLdsStackEntries; + if (payload.needLdsStack) { + assert(m_workGroupSize > 0); + auto ldsStackSize = m_workGroupSize * MaxLdsStackEntries; // Double LDS size when any operations requires to perform on extra stack. if (payload.needExtraStack) ldsStackSize = ldsStackSize << 1; @@ -183,7 +201,7 @@ void LowerGpuRt::createGlobalStack(Module &module) { void LowerGpuRt::visitGetStackSize(GpurtGetStackSizeOp &inst) { m_builder->SetInsertPoint(&inst); Value *size = nullptr; - size = m_builder->getInt32(MaxLdsStackEntries * getWorkgroupSize()); + size = m_builder->getInt32(MaxLdsStackEntries * m_workGroupSize); inst.replaceAllUsesWith(size); m_callsToLower.push_back(&inst); m_funcsToLower.insert(inst.getCalledFunction()); @@ -207,7 +225,7 @@ void LowerGpuRt::visitGetStackBase(GpurtGetStackBaseOp &inst) { // @param inst : The dialect instruction to process void LowerGpuRt::visitGetStackStride(GpurtGetStackStrideOp &inst) { m_builder->SetInsertPoint(&inst); - Value *stride = m_builder->getInt32(getWorkgroupSize()); + Value *stride = m_builder->getInt32(m_workGroupSize); inst.replaceAllUsesWith(stride); m_callsToLower.push_back(&inst); m_funcsToLower.insert(inst.getCalledFunction()); @@ -222,7 +240,7 @@ void LowerGpuRt::visitStackRead(GpurtStackReadOp &inst) { Value *stackIndex = inst.getIndex(); Type *stackTy = PointerType::get(m_builder->getInt32Ty(), 3); if (inst.getUseExtraStack()) { - auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries); + auto ldsStackSize = m_builder->getInt32(m_workGroupSize * MaxLdsStackEntries); stackIndex = m_builder->CreateAdd(stackIndex, ldsStackSize); } @@ -244,7 +262,7 @@ void 
LowerGpuRt::visitStackWrite(GpurtStackWriteOp &inst) { Value *stackData = inst.getValue(); Type *stackTy = PointerType::get(m_builder->getInt32Ty(), 3); if (inst.getUseExtraStack()) { - auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries); + auto ldsStackSize = m_builder->getInt32(m_workGroupSize * MaxLdsStackEntries); stackIndex = m_builder->CreateAdd(stackIndex, ldsStackSize); } @@ -266,7 +284,7 @@ void LowerGpuRt::visitLdsStackInit(GpurtLdsStackInitOp &inst) { // From Navi3x on, Hardware has decided that the stacks are only swizzled across every 32 threads, // with stacks for every set of 32 threads stored after all the stack data for the previous 32 threads. - if (getWorkgroupSize() > 32) { + if (m_workGroupSize > 32) { // localThreadId = (LinearLocalThreadID%32) // localGroupId = (LinearLocalThreadID/32) // stackSize = STACK_SIZE * 32 = m_stackEntries * 32 @@ -281,7 +299,7 @@ void LowerGpuRt::visitLdsStackInit(GpurtLdsStackInitOp &inst) { } if (inst.getUseExtraStack()) { - auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries); + auto ldsStackSize = m_builder->getInt32(m_workGroupSize * MaxLdsStackEntries); stackBasePerThread = m_builder->CreateAdd(stackBasePerThread, ldsStackSize); } diff --git a/lgc/patch/PatchImageDerivatives.cpp b/lgc/patch/LowerImageDerivatives.cpp similarity index 95% rename from lgc/patch/PatchImageDerivatives.cpp rename to lgc/patch/LowerImageDerivatives.cpp index e7798d7acd..933b178113 100644 --- a/lgc/patch/PatchImageDerivatives.cpp +++ b/lgc/patch/LowerImageDerivatives.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchImageDerivatives.cpp - * @brief LLPC source file: contains implementation of class lgc::PatchImageDerivatives. 
+ * @file LowerImageDerivatives.cpp + * @brief LLPC source file: contains implementation of class lgc::LowerImageDerivatives. *********************************************************************************************************************** */ -#include "lgc/patch/PatchImageDerivatives.h" +#include "lgc/patch/LowerImageDerivatives.h" #include "lgc/patch/Patch.h" #include "lgc/state/PipelineState.h" #include "llvm/ADT/SmallSet.h" @@ -37,7 +37,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "lgc-patch-image-derivatives" +#define DEBUG_TYPE "lgc-lower-image-derivatives" using namespace llvm; using namespace lgc; @@ -58,10 +58,10 @@ static bool usesImplicitDerivatives(StringRef name) { // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchImageDerivatives::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerImageDerivatives::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Image-Derivatives\n"); + LLVM_DEBUG(dbgs() << "Run the pass Lower-Image-Derivatives\n"); if (!pipelineState->hasShaderStage(ShaderStage::Fragment)) return PreservedAnalyses::all(); diff --git a/lgc/patch/PatchInOutImportExport.cpp b/lgc/patch/LowerInOut.cpp similarity index 98% rename from lgc/patch/PatchInOutImportExport.cpp rename to lgc/patch/LowerInOut.cpp index 2040857824..fdff15e302 100644 --- a/lgc/patch/PatchInOutImportExport.cpp +++ b/lgc/patch/LowerInOut.cpp @@ -24,12 +24,12 @@ **********************************************************************************************************************/ /** 
*********************************************************************************************************************** - * @file PatchInOutImportExport.cpp + * @file LowerInOut.cpp * @brief LLPC source file: contains implementation of class lgc::PatchInOutImportExport. * *********************************************************************************************************************** */ -#include "lgc/patch/PatchInOutImportExport.h" +#include "lgc/patch/LowerInOut.h" #include "lgc/Builder.h" #include "lgc/BuiltIns.h" #include "lgc/LgcDialect.h" @@ -141,11 +141,10 @@ PreservedAnalyses PatchInOutImportExport::run(Module &module, ModuleAnalysisMana // Process each shader in turn, in reverse order (because for example VS uses inOutUsage.tcs.calcFactor // set by TCS). - for (int shaderStage = ShaderStage::CountInternal - 1; shaderStage >= 0; --shaderStage) { - auto entryPoint = pipelineShaders.getEntryPoint(static_cast(shaderStage)); + for (auto stage : llvm::reverse(ShaderStagesNativeCopy)) { + auto entryPoint = pipelineShaders.getEntryPoint(stage); if (entryPoint) { - processFunction(*entryPoint, static_cast(shaderStage), inputCallees, otherCallees, - getPostDominatorTree); + processFunction(*entryPoint, stage, inputCallees, otherCallees, getPostDominatorTree); } } @@ -246,8 +245,8 @@ void PatchInOutImportExport::markExportDone(Function *func, PostDominatorTree &p // Process a single shader void PatchInOutImportExport::processShader() { // Initialize the output value for gl_PrimitiveID - const auto &builtInUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->builtInUsage; - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &builtInUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->builtInUsage; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; if (m_shaderStage == ShaderStage::Vertex) { if 
(builtInUsage.vs.primitiveId) m_primitiveId = getFunctionArgument(m_entryPoint, entryArgIdxs.vs.primitiveId); @@ -453,7 +452,7 @@ void PatchInOutImportExport::processShader() { unsigned workgroupSizeX = mode.workgroupSizeX; unsigned workgroupSizeY = mode.workgroupSizeY; unsigned workgroupSizeZ = mode.workgroupSizeZ; - SwizzleWorkgroupLayout layout = calculateWorkgroupLayout(m_pipelineState, m_shaderStage); + SwizzleWorkgroupLayout layout = calculateWorkgroupLayout(m_pipelineState, m_shaderStage.value()); while (!func.use_empty()) { CallInst *reconfigCall = cast(*func.user_begin()); Value *localInvocationId = reconfigCall->getArgOperand(0); @@ -463,7 +462,7 @@ void PatchInOutImportExport::processShader() { (layout.macroLayout == WorkgroupLayout::SexagintiQuads)) { BuilderBase builder(reconfigCall); localInvocationId = reconfigWorkgroupLayout( - localInvocationId, m_pipelineState, m_shaderStage, layout.macroLayout, layout.microLayout, + localInvocationId, m_pipelineState, m_shaderStage.value(), layout.macroLayout, layout.microLayout, workgroupSizeX, workgroupSizeY, workgroupSizeZ, isHwLocalInvocationId, builder); } } @@ -514,7 +513,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { BuilderBase builder(*m_context); builder.SetInsertPoint(&callInst); - auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); auto mangledName = callee->getName(); @@ -556,7 +555,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { LLVM_DEBUG(dbgs() << "Find input import call: builtin = " << builtInId << "\n"); - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: // Nothing to do break; @@ -638,7 +637,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { origLocInfo.setLocation(origLoc); if (m_shaderStage == ShaderStage::TessEval || (m_shaderStage == ShaderStage::Fragment && - 
(m_pipelineState->getPrevShaderStage(m_shaderStage) == ShaderStage::Mesh || + (m_pipelineState->getPrevShaderStage(m_shaderStage.value()) == ShaderStage::Mesh || m_pipelineState->isUnlinked()))) { // NOTE: For generic inputs of tessellation evaluation shader or fragment shader whose previous shader stage // is mesh shader or is in unlinked pipeline, they could be per-patch ones or per-primitive ones. @@ -664,7 +663,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { } } } else { - if (m_pipelineState->canPackInput(m_shaderStage)) { + if (m_pipelineState->canPackInput(m_shaderStage.value())) { // The inputLocInfoMap of {TCS, GS, FS} maps original InOutLocationInfo to tightly compact InOutLocationInfo const bool isTcs = m_shaderStage == ShaderStage::TessControl; (void)isTcs; @@ -700,7 +699,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { elemIdx = genericLocationOp.getElemIdx(); assert(isDontCareValue(elemIdx) == false); - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::TessControl: { auto &inputOp = cast(genericLocationOp); auto vertexIdx = inputOp.getArrayIndex(); @@ -833,7 +832,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { unsigned streamId = cast(callInst.getOperand(2))->getZExtValue(); // NOTE: Transform feedback output will be done in last vertex-processing shader stage. 
- switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: { // No TS/GS pipeline, VS is the last stage if (!m_hasGs && !m_hasTs) @@ -863,7 +862,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { } else if (isBuiltInOutputExport) { const unsigned builtInId = value; - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: { patchVsBuiltInOutputExport(output, builtInId, builder); break; @@ -881,7 +880,8 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { break; } case ShaderStage::Geometry: { - patchGsBuiltInOutputExport(output, builtInId, m_pipelineState->getRasterizerState().rasterStream, builder); + const unsigned streamId = cast(callInst.getOperand(1))->getZExtValue(); + patchGsBuiltInOutputExport(output, builtInId, streamId, builder); break; } case ShaderStage::Mesh: { @@ -967,7 +967,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { origLocInfo.setComponent(component); auto locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo); - if (m_pipelineState->canPackOutput(m_shaderStage)) { + if (m_pipelineState->canPackOutput(m_shaderStage.value())) { if (locInfoMapIt != resUsage->inOutUsage.outputLocInfoMap.end()) { loc = locInfoMapIt->second.getLocation(); elemIdx = locInfoMapIt->second.getComponent(); @@ -985,7 +985,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { // NOTE: Some outputs are not used by next shader stage. They must have been removed already. 
assert(loc != InvalidValue); - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: { assert(callInst.arg_size() == 3); if (elemIdx == InvalidValue) @@ -1085,8 +1085,23 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { if (m_pipelineState->getShaderModes()->getGeometryShaderMode().robustGsEmits) { auto totalEmitCounterPtr = m_pipelineSysValues.get(m_entryPoint)->getTotalEmitCounterPtr(); Value *totalEmitCounter = builder.CreateLoad(builder.getInt32Ty(), totalEmitCounterPtr); + + // totalEmitCounter++ totalEmitCounter = builder.CreateAdd(totalEmitCounter, builder.getInt32(1)); builder.CreateStore(totalEmitCounter, totalEmitCounterPtr); + + if (m_gfxIp.major < 11) { + // NOTE: For pre-GFX11, the counters of primitives written are driven by the message GS_EMIT/GS_CUT. + // Therefore, we must send such message conditionally by checking if the emit is within expected range. + + // validEmit = totalEmitCounter <= outputVertices + const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); + auto validEmit = builder.CreateICmpULE(totalEmitCounter, builder.getInt32(geometryMode.outputVertices)); + + // Send the GS_EMIT message conditionally + builder.CreateIf(validEmit, false); + callInst.moveBefore(&*builder.GetInsertPoint()); + } } } } @@ -1099,10 +1114,10 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { // @param retInst : "Ret" instruction void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { // We only handle the "ret" of shader entry point - if (m_shaderStage == ShaderStage::Invalid) + if (!m_shaderStage) return; - const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); + const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); // Whether this shader stage has to use "exp" instructions to export outputs const bool useExpInst = ((m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || @@ 
-1189,7 +1204,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { unsigned clipDistanceCount = 0; unsigned cullDistanceCount = 0; - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; if (m_shaderStage == ShaderStage::Vertex) { auto &builtInUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Vertex)->builtInUsage.vs; @@ -1529,7 +1544,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { // If we are building unlinked relocatable shaders, it is possible there are // generic outputs that are not written to. We need to count them in // the export count. - auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); for (const auto &locInfoPair : resUsage->inOutUsage.outputLocInfoMap) { const unsigned newLoc = locInfoPair.second.getLocation(); if (m_expLocs.count(newLoc) != 0) @@ -3322,6 +3337,9 @@ void PatchInOutImportExport::patchTesBuiltInOutputExport(Value *output, unsigned // @param builder : the builder to use void PatchInOutImportExport::patchGsBuiltInOutputExport(Value *output, unsigned builtInId, unsigned streamId, BuilderBase &builder) { + if (streamId != m_pipelineState->getRasterizerState().rasterStream) + return; // Skip built-in export if this stream is not the rasterization stream. 
+ const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry); const auto &builtInUsage = resUsage->builtInUsage.gs; const auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; @@ -3781,7 +3799,7 @@ void PatchInOutImportExport::storeValueToStreamOutBuffer(Value *storeValue, unsi Value *writeIndex = nullptr; Value *streamOffset = nullptr; - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; if (m_shaderStage == ShaderStage::Vertex) { streamInfo = getFunctionArgument(m_entryPoint, entryArgIdxs.vs.streamOutData.streamInfo); writeIndex = getFunctionArgument(m_entryPoint, entryArgIdxs.vs.streamOutData.writeIndex); @@ -3818,7 +3836,7 @@ void PatchInOutImportExport::storeValueToStreamOutBuffer(Value *storeValue, unsi streamOffset = builder.CreateShl(streamOffset, 2); // GPU will drop stream-out buffer store when the thread ID is invalid (OOB_select is set to SQ_OOB_INDEX_ONLY). - const unsigned outOfRangeWriteIndex = InvalidValue - (m_pipelineState->getShaderWaveSize(m_shaderStage) - 1); + const unsigned outOfRangeWriteIndex = InvalidValue - (m_pipelineState->getShaderWaveSize(m_shaderStage.value()) - 1); // validStreamOutVertex = threadId < streamOutVertexCount auto validStreamOutVertex = builder.CreateICmpULT(m_threadId, streamOutVertexCount); // writeIndex = validStreamOutVertex ? 
writeIndex : outOfRangeWriteIndex @@ -3893,7 +3911,7 @@ void PatchInOutImportExport::storeValueToEsGsRing(Value *storeValue, unsigned lo } // Call buffer store intrinsic or LDS store - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; Value *esGsOffset = nullptr; if (m_shaderStage == ShaderStage::Vertex) esGsOffset = getFunctionArgument(m_entryPoint, entryArgIdxs.vs.esGsOffset); @@ -4022,7 +4040,7 @@ void PatchInOutImportExport::storeValueToGsVsRing(Value *storeValue, unsigned lo storeValue = builder.CreateBitCast(storeValue, builder.getInt32Ty()); } - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; Value *gsVsOffset = getFunctionArgument(m_entryPoint, entryArgIdxs.gs.gsVsOffset); auto emitCounterPair = m_pipelineSysValues.get(m_entryPoint)->getEmitCounterPtr(); @@ -4224,8 +4242,8 @@ Value *PatchInOutImportExport::readValueFromLds(bool offChip, Type *readTy, Valu // Read from off-chip LDS buffer const auto &offChipLdsBaseArgIdx = m_shaderStage == ShaderStage::TessEval - ? m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs.tes.offChipLdsBase - : m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs.tcs.offChipLdsBase; + ? 
m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tes.offChipLdsBase + : m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tcs.offChipLdsBase; auto offChipLdsDesc = m_pipelineSysValues.get(m_entryPoint)->getOffChipLdsDesc(); @@ -4322,7 +4340,7 @@ void PatchInOutImportExport::writeValueToLds(bool offChip, Value *writeValue, Va if (offChip) { // Write to off-chip LDS buffer - auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs.tcs; + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tcs; auto offChipLdsBase = getFunctionArgument(m_entryPoint, entryArgIdxs.offChipLdsBase); // Convert dword off-chip LDS offset to byte offset @@ -4528,7 +4546,7 @@ Value *PatchInOutImportExport::calcLdsOffsetForTesInput(Type *inputTy, unsigned auto outPatchStart = calcFactor.offChip.outPatchStart; auto patchConstStart = calcFactor.offChip.patchConstStart; - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs.tes; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tes; auto relPatchId = getFunctionArgument(m_entryPoint, entryArgIdxs.relPatchId); @@ -4680,7 +4698,7 @@ unsigned PatchInOutImportExport::calcPatchCountPerThreadGroup(unsigned inVertexC void PatchInOutImportExport::addExportInstForGenericOutput(Value *output, unsigned location, unsigned compIdx, BuilderBase &builder) { // Check if the shader stage is valid to use "exp" instruction to export output - const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); + const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); const bool useExpInst = ((m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::CopyShader) && (!nextStage || nextStage == ShaderStage::Fragment)); @@ -4874,7 +4892,7 @@ Value 
*PatchInOutImportExport::getSubgroupLocalInvocationId(BuilderBase &builder Value *subgroupLocalInvocationId = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {builder.getInt32(-1), builder.getInt32(0)}); - unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); if (waveSize == 64) { subgroupLocalInvocationId = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {builder.getInt32(-1), subgroupLocalInvocationId}); @@ -5401,7 +5419,7 @@ void PatchInOutImportExport::recordVertexAttribExport(unsigned location, ArrayRe m_attribExports[location][i] = attribValues[i]; // Update values that are valid } - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; inOutUsage.expCount = std::max(inOutUsage.expCount, location + 1); // Update export count } @@ -5413,7 +5431,7 @@ void PatchInOutImportExport::exportVertexAttribs(BuilderBase &builder) { assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::CopyShader); // Valid shader stages if (m_attribExports.empty()) { - assert(m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage.expCount == 0); + assert(m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage.expCount == 0); return; } diff --git a/lgc/patch/PatchInvariantLoads.cpp b/lgc/patch/LowerInvariantLoads.cpp similarity index 97% rename from lgc/patch/PatchInvariantLoads.cpp rename to lgc/patch/LowerInvariantLoads.cpp index 651d6a5ec7..770c717218 100644 --- a/lgc/patch/PatchInvariantLoads.cpp +++ b/lgc/patch/LowerInvariantLoads.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** 
*********************************************************************************************************************** - * @file PatchInvariantLoads.cpp - * @brief LLPC source file: contains implementation of class lgc::PatchInvariantLoads. + * @file LowerInvariantLoads.cpp + * @brief LLPC source file: contains implementation of class lgc::LowerInvariantLoads. *********************************************************************************************************************** */ -#include "lgc/patch/PatchInvariantLoads.h" +#include "lgc/patch/LowerInvariantLoads.h" #include "lgc/patch/Patch.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" @@ -37,7 +37,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "lgc-patch-invariant-loads" +#define DEBUG_TYPE "lgc-lower-invariant-loads" using namespace llvm; using namespace lgc; @@ -83,12 +83,12 @@ static unsigned findAddressSpaceAccess(const Instruction *inst) { // @param [in/out] function : Function that we will patch. 
// @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchInvariantLoads::run(Function &function, FunctionAnalysisManager &analysisManager) { +PreservedAnalyses LowerInvariantLoads::run(Function &function, FunctionAnalysisManager &analysisManager) { const auto &moduleAnalysisManager = analysisManager.getResult(function); PipelineState *pipelineState = moduleAnalysisManager.getCachedResult(*function.getParent())->getPipelineState(); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Invariant-Loads\n"); + LLVM_DEBUG(dbgs() << "Run the pass Lower-Invariant-Loads\n"); auto shaderStage = lgc::getShaderStage(&function); if (!shaderStage) diff --git a/lgc/patch/PatchMulDx9Zero.cpp b/lgc/patch/LowerMulDx9Zero.cpp similarity index 99% rename from lgc/patch/PatchMulDx9Zero.cpp rename to lgc/patch/LowerMulDx9Zero.cpp index b9cdb6f537..fa9495121d 100644 --- a/lgc/patch/PatchMulDx9Zero.cpp +++ b/lgc/patch/LowerMulDx9Zero.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchMulDx9Zero.cpp + * @file LowerMulDx9Zero.cpp * @brief LLPC source file: contains implementation of class lgc::PatchMulDx9Zero. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchMulDx9Zero.h" +#include "lgc/patch/LowerMulDx9Zero.h" #include "lgc/state/PipelineShaders.h" #include "lgc/state/PipelineState.h" #include "llvm/IR/Constants.h" diff --git a/lgc/patch/MeshTaskShader.cpp b/lgc/patch/MeshTaskShader.cpp index 0fcd139544..c289b82595 100644 --- a/lgc/patch/MeshTaskShader.cpp +++ b/lgc/patch/MeshTaskShader.cpp @@ -58,12 +58,6 @@ MeshTaskShader::MeshTaskShader(PipelineState *pipelineState, m_pipelineSysValues.initialize(m_pipelineState); } -// ===================================================================================================================== -// Destructor -MeshTaskShader::~MeshTaskShader() { - m_pipelineSysValues.clear(); -} - // ===================================================================================================================== // Layout mesh shader LDS if 'ldsLayout' is specified and calculate the required total LDS size (in dwords). 
// diff --git a/lgc/patch/MeshTaskShader.h b/lgc/patch/MeshTaskShader.h index 84ed5b1b12..35075fdf07 100644 --- a/lgc/patch/MeshTaskShader.h +++ b/lgc/patch/MeshTaskShader.h @@ -86,7 +86,6 @@ struct MeshOutputsLayout { class MeshTaskShader { public: MeshTaskShader(PipelineState *pipelineState, PatchPreparePipelineAbi::FunctionAnalysisHandlers *analysisHandlers); - ~MeshTaskShader(); static unsigned layoutMeshShaderLds(PipelineState *pipelineState, llvm::Function *entryPoint, MeshLdsLayout *ldsLayout = nullptr, MeshOutputsLayout *outputsLayout = nullptr); diff --git a/lgc/patch/PatchEntryPointMutate.cpp b/lgc/patch/MutateEntryPoint.cpp similarity index 94% rename from lgc/patch/PatchEntryPointMutate.cpp rename to lgc/patch/MutateEntryPoint.cpp index 2e0808f118..f69f32b56a 100644 --- a/lgc/patch/PatchEntryPointMutate.cpp +++ b/lgc/patch/MutateEntryPoint.cpp @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchEntryPointMutate.cpp - * @brief The lgc::PatchEntryPointMutate pass determines the final user data layout of shaders. + * @file MutateEntryPoint.cpp + * @brief The lgc::MutateEntryPoint pass determines the final user data layout of shaders. 
* * This consists of * - removing unused user data @@ -53,7 +53,7 @@ *********************************************************************************************************************** */ -#include "lgc/patch/PatchEntryPointMutate.h" +#include "lgc/patch/MutateEntryPoint.h" #include "ShaderMerger.h" #include "compilerutils/CompilerUtils.h" #include "lgc/LgcContext.h" @@ -80,21 +80,21 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include -#define DEBUG_TYPE "lgc-patch-entry-point-mutate" +#define DEBUG_TYPE "lgc-mutate-entry-point" using namespace llvm; using namespace lgc; using namespace cps; // ===================================================================================================================== -PatchEntryPointMutate::PatchEntryPointMutate() +MutateEntryPoint::MutateEntryPoint() : m_hasTs(false), m_hasGs(false), m_setInactiveChainArgId(Function::lookupIntrinsicID("llvm.amdgcn.set.inactive.chain.arg")) { } // ===================================================================================================================== -PatchEntryPointMutate::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::Twine &name, unsigned userDataValue, - unsigned *argIndex) +MutateEntryPoint::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::Twine &name, unsigned userDataValue, + unsigned *argIndex) : argTy(argTy), name(name.str()), userDataValue(userDataValue), argIndex(argIndex) { if (llvm::isa(argTy)) argDwordSize = argTy->getPointerAddressSpace() == ADDR_SPACE_CONST_32BIT ? 
1 : 2; @@ -103,8 +103,8 @@ PatchEntryPointMutate::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::T } // ===================================================================================================================== -PatchEntryPointMutate::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::Twine &name, - UserDataMapping userDataValue, unsigned *argIndex) +MutateEntryPoint::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::Twine &name, UserDataMapping userDataValue, + unsigned *argIndex) : UserDataArg(argTy, name, static_cast(userDataValue), argIndex) { } @@ -114,11 +114,11 @@ PatchEntryPointMutate::UserDataArg::UserDataArg(llvm::Type *argTy, const llvm::T // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchEntryPointMutate::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses MutateEntryPoint::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); PipelineShadersResult &pipelineShaders = analysisManager.getResult(module); - LLVM_DEBUG(dbgs() << "Run the pass Patch-Entry-Point-Mutate\n"); + LLVM_DEBUG(dbgs() << "Run the pass Mutate-Entry-Point\n"); Patch::init(&module); @@ -140,13 +140,13 @@ PreservedAnalyses PatchEntryPointMutate::run(Module &module, ModuleAnalysisManag if (m_pipelineState->isGraphics()) { // Process each shader in turn, but not the copy shader. - for (unsigned shaderStage = 0; shaderStage < ShaderStage::NativeStageCount; ++shaderStage) { - m_entryPoint = pipelineShaders.getEntryPoint(static_cast(shaderStage)); + for (auto stage : ShaderStagesNative) { + m_entryPoint = pipelineShaders.getEntryPoint(stage); if (m_entryPoint) { // ToDo: This should always be skipped since we don't implement CPS metadata yet. 
assert(!lgc::cps::isCpsFunction(*m_entryPoint) && "CPS support not implemented yet"); - m_shaderStage = static_cast(shaderStage); + m_shaderStage = stage; processShader(&shaderInputs); } } @@ -260,12 +260,12 @@ static Value *mergeDwordsIntoVector(IRBuilder<> &builder, ArrayRef inpu // Process LoadDriverTableEntryOp. // // @param module : LLVM module -void PatchEntryPointMutate::processDriverTableLoad(Module &module) { +void MutateEntryPoint::processDriverTableLoad(Module &module) { SmallVector callsToRemove; struct Payload { SmallVectorImpl &callsToRemove; - PatchEntryPointMutate *self; + MutateEntryPoint *self; }; Payload payload = {callsToRemove, this}; @@ -287,7 +287,7 @@ void PatchEntryPointMutate::processDriverTableLoad(Module &module) { // Lower LoadDriverTableEntryOp. // // @param loadDriverTablePtrOp : Call instruction to load driver table pointer -void PatchEntryPointMutate::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDriverTablePtrOp) { +void MutateEntryPoint::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDriverTablePtrOp) { BuilderBase builder(&loadDriverTablePtrOp); Function *entryPoint = loadDriverTablePtrOp.getFunction(); builder.SetInsertPoint(&loadDriverTablePtrOp); @@ -304,12 +304,12 @@ void PatchEntryPointMutate::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDri // Process GroupMemcpyOp. // // @param module : LLVM module -void PatchEntryPointMutate::processGroupMemcpy(Module &module) { +void MutateEntryPoint::processGroupMemcpy(Module &module) { SmallVector callsToRemove; struct Payload { SmallVectorImpl &callsToRemove; - PatchEntryPointMutate *self; + MutateEntryPoint *self; }; Payload payload = {callsToRemove, this}; @@ -331,7 +331,7 @@ void PatchEntryPointMutate::processGroupMemcpy(Module &module) { // Lower GroupMemcpyOp - Copy memory using threads in a workgroup (scope=2) or subgroup (scope=3). 
// // @param groupMemcpyOp : Call instruction to do group memory copy -void PatchEntryPointMutate::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) { +void MutateEntryPoint::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) { BuilderImpl builder(m_pipelineState); Function *entryPoint = groupMemcpyOp.getFunction(); auto stage = getShaderStage(entryPoint); @@ -512,7 +512,7 @@ void PatchEntryPointMutate::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) { // Lower as.continuation.reference call. // // @param asCpsReferenceOp: the instruction -void PatchEntryPointMutate::lowerAsCpsReference(cps::AsContinuationReferenceOp &asCpsReferenceOp) { +void MutateEntryPoint::lowerAsCpsReference(cps::AsContinuationReferenceOp &asCpsReferenceOp) { BuilderBase builder(&asCpsReferenceOp); Value *reloc = nullptr; @@ -534,14 +534,14 @@ void PatchEntryPointMutate::lowerAsCpsReference(cps::AsContinuationReferenceOp & // @param shaderInputs: the ShaderInputs information for the parent function. This is only used for continufy based // continuation transform, under which we still need to pass ShaderInput arguments(WorkgroupId/LocalInvocationId) during // cps chain call. -bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInputs) { +bool MutateEntryPoint::lowerCpsOps(Function *func, ShaderInputs *shaderInputs) { SmallVector cpsJumps; SmallVector tobeErased; struct Payload { SmallVectorImpl &jumps; SmallVectorImpl &tobeErased; - PatchEntryPointMutate *self; + MutateEntryPoint *self; }; Payload payload = {cpsJumps, tobeErased, this}; @@ -657,7 +657,7 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInpu // Jump to next cps function. // ret: // ret void - unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); Type *waveMaskTy = builder.getIntNTy(waveSize); // For continufy based continuation, the vgpr list: LocalInvocationId(optional), vcr, vsp, ... 
unsigned vcrIndexInVgpr = haveLocalInvocationId ? 1 : 0; @@ -791,8 +791,8 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInpu // @param func : the cps function to be mutated // @param fixedShaderArgTys : the types of the fixed shader arguments(userdata + possibly shader inputs) // @param argNames : the name string of the fixed shader arguments -Function *PatchEntryPointMutate::lowerCpsFunction(Function *func, ArrayRef fixedShaderArgTys, - ArrayRef argNames) { +Function *MutateEntryPoint::lowerCpsFunction(Function *func, ArrayRef fixedShaderArgTys, + ArrayRef argNames) { Value *state = func->getArg(0); const DataLayout &layout = func->getParent()->getDataLayout(); IRBuilder<> builder(func->getContext()); @@ -889,8 +889,7 @@ Function *PatchEntryPointMutate::lowerCpsFunction(Function *func, ArrayRef &builder, Type *waveMaskTy, - ArrayRef priorties) { +Value *MutateEntryPoint::takeLevel(Value *level, IRBuilder<> &builder, Type *waveMaskTy, ArrayRef priorties) { auto levelMask = builder.CreateICmpNE(level, builder.getInt32(0)); Value *levelBallot = builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, waveMaskTy, levelMask); Value *cond = nullptr; @@ -911,8 +910,8 @@ Value *PatchEntryPointMutate::takeLevel(Value *level, IRBuilder<> &builder, Type // @param parent : the parent function of the cps.jump operation // @param jumpOp : the call instruction of cps.jump // @param [in/out] exitInfos : the vector of cps exit information to be filled -unsigned PatchEntryPointMutate::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, BasicBlock *tailBlock, - SmallVectorImpl &exitInfos) { +unsigned MutateEntryPoint::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, BasicBlock *tailBlock, + SmallVectorImpl &exitInfos) { IRBuilder<> builder(parent->getContext()); const DataLayout &layout = parent->getParent()->getDataLayout(); // Translate @lgc.cps.jump(CR %target, i32 %levels, T %state, ...) 
into: @@ -965,7 +964,7 @@ unsigned PatchEntryPointMutate::lowerCpsJump(Function *parent, cps::JumpOp *jump // are potentially used in other functions. It also modifies each call to pass the shader inputs between functions. // // @param module : IR module -void PatchEntryPointMutate::setupComputeWithCalls(Module *module) { +void MutateEntryPoint::setupComputeWithCalls(Module *module) { m_computeWithCalls = false; if (m_pipelineState->isComputeLibrary()) { @@ -1000,11 +999,11 @@ void PatchEntryPointMutate::setupComputeWithCalls(Module *module) { // Gather user data usage in all shaders // // @param module : IR module -void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { +void MutateEntryPoint::gatherUserDataUsage(Module *module) { // Gather special ops requiring user data. static const auto visitor = - llvm_dialects::VisitorBuilder() - .add([](PatchEntryPointMutate &self, UserDataOp &op) { + llvm_dialects::VisitorBuilder() + .add([](MutateEntryPoint &self, UserDataOp &op) { auto stage = getShaderStage(op.getFunction()); assert(stage != ShaderStage::CopyShader); auto userDataUsage = self.getUserDataUsage(stage.value()); @@ -1064,7 +1063,7 @@ void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { self.m_pipelineState->getPalMetadata()->setUserDataSpillUsage(op.getOffset() / 4, stage); } }) - .add([](PatchEntryPointMutate &self, LoadUserDataOp &op) { + .add([](MutateEntryPoint &self, LoadUserDataOp &op) { auto stage = getShaderStage(op.getFunction()); assert(stage != ShaderStage::CopyShader); auto *userDataUsage = self.getUserDataUsage(stage.value()); @@ -1111,8 +1110,8 @@ void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { // ===================================================================================================================== // Load a value of a simple type from user data at the given dwordOffset. 
-Value *PatchEntryPointMutate::loadUserData(const UserDataUsage &userDataUsage, Value *spillTable, Type *type, - unsigned dwordOffset, BuilderBase &builder) { +Value *MutateEntryPoint::loadUserData(const UserDataUsage &userDataUsage, Value *spillTable, Type *type, + unsigned dwordOffset, BuilderBase &builder) { Function *func = builder.GetInsertBlock()->getParent(); unsigned dwordSize = m_module->getDataLayout().getTypeStoreSize(type) / 4; if (dwordOffset + dwordSize <= userDataUsage.entryArgIdxs.size()) { @@ -1163,7 +1162,7 @@ Value *PatchEntryPointMutate::loadUserData(const UserDataUsage &userDataUsage, V // spilled. // // @param module : IR module -void PatchEntryPointMutate::fixupUserDataUses(Module &module) { +void MutateEntryPoint::fixupUserDataUses(Module &module) { BuilderBase builder(module.getContext()); // For each function definition... @@ -1252,7 +1251,7 @@ void PatchEntryPointMutate::fixupUserDataUses(Module &module) { // Process a single shader // // @param shaderInputs : ShaderInputs object representing hardware-provided shader inputs -void PatchEntryPointMutate::processShader(ShaderInputs *shaderInputs) { +void MutateEntryPoint::processShader(ShaderInputs *shaderInputs) { // Create new entry-point from the original one SmallVector argTys; SmallVector argNames; @@ -1265,7 +1264,7 @@ void PatchEntryPointMutate::processShader(ShaderInputs *shaderInputs) { addFunctionArgs(origEntryPoint, origEntryPoint->getFunctionType()->getReturnType(), argTys, argNames, inRegMask); // We always deal with pre-merge functions here, so set the fitting pre-merge calling conventions. 
- switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Task: entryPoint->setCallingConv(CallingConv::AMDGPU_CS); break; @@ -1311,7 +1310,7 @@ void PatchEntryPointMutate::processShader(ShaderInputs *shaderInputs) { // // @param shaderInputs : ShaderInputs object representing hardware-provided shader inputs // @param [in/out] module : Module -void PatchEntryPointMutate::processComputeFuncs(ShaderInputs *shaderInputs, Module &module) { +void MutateEntryPoint::processComputeFuncs(ShaderInputs *shaderInputs, Module &module) { m_shaderStage = ShaderStage::Compute; // We no longer support compute shader fixed layout required before PAL interface version 624. @@ -1398,9 +1397,8 @@ void PatchEntryPointMutate::processComputeFuncs(ShaderInputs *shaderInputs, Modu // Process all real function calls and passes arguments to them. // // @param [in/out] module : Module -void PatchEntryPointMutate::processCalls(Function &func, ArrayRef shaderInputTys, - ArrayRef shaderInputNames, uint64_t inRegMask, - unsigned argOffset) { +void MutateEntryPoint::processCalls(Function &func, ArrayRef shaderInputTys, + ArrayRef shaderInputNames, uint64_t inRegMask, unsigned argOffset) { // This is one of: // - a compute pipeline with non-inlined functions; // - a compute pipeline with calls to library functions; @@ -1457,7 +1455,7 @@ void PatchEntryPointMutate::processCalls(Function &func, ArrayRef shader // ===================================================================================================================== // Set Attributes on new function -void PatchEntryPointMutate::setFuncAttrs(Function *entryPoint) { +void MutateEntryPoint::setFuncAttrs(Function *entryPoint) { AttrBuilder builder(entryPoint->getContext()); if (m_shaderStage == ShaderStage::Fragment) { auto &builtInUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment)->builtInUsage.fs; @@ -1520,8 +1518,8 @@ void PatchEntryPointMutate::setFuncAttrs(Function *entryPoint) { } // Set VGPR, 
SGPR, and wave limits - auto shaderOptions = &m_pipelineState->getShaderOptions(m_shaderStage); - auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + auto shaderOptions = &m_pipelineState->getShaderOptions(m_shaderStage.value()); + auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); unsigned vgprLimit = shaderOptions->vgprLimit; unsigned sgprLimit = shaderOptions->sgprLimit; @@ -1552,7 +1550,7 @@ void PatchEntryPointMutate::setFuncAttrs(Function *entryPoint) { // Graphics shader stages don't have thread groups at an API level tgSize = 1; } - unsigned numWavesPerTg = divideCeil(tgSize, m_pipelineState->getShaderWaveSize(m_shaderStage)); + unsigned numWavesPerTg = divideCeil(tgSize, m_pipelineState->getShaderWaveSize(m_shaderStage.value())); unsigned maxWavesPerCu = numWavesPerTg * shaderOptions->maxThreadGroupsPerComputeUnit; unsigned maxWavesPerSimd = divideCeil(maxWavesPerCu, 2); std::string wavesPerEu = std::string("1,") + std::to_string(maxWavesPerSimd); @@ -1627,14 +1625,14 @@ void PatchEntryPointMutate::setFuncAttrs(Function *entryPoint) { // @returns inRegMask : "Inreg" bit mask for the arguments, with a bit set to indicate that the corresponding // arg needs to have an "inreg" attribute to put the arg into SGPRs rather than VGPRs // -uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInputs, Function *origFunc, - SmallVectorImpl &argTys, - SmallVectorImpl &argNames, unsigned argOffset, - bool updateUserDataMap) { +uint64_t MutateEntryPoint::generateEntryPointArgTys(ShaderInputs *shaderInputs, Function *origFunc, + SmallVectorImpl &argTys, + SmallVectorImpl &argNames, unsigned argOffset, + bool updateUserDataMap) { uint64_t inRegMask = 0; IRBuilder<> builder(*m_context); - auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage); + auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage.value()); auto &entryArgIdxs = intfData->entryArgIdxs; 
entryArgIdxs.initialized = true; @@ -1677,7 +1675,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp "Expecting descriptor set values to be one dword. The linker cannot handle anything else."); if (isSystemUserData) { unsigned index = userDataArg.userDataValue - static_cast(UserDataMapping::GlobalTable); - auto &specialUserData = getUserDataUsage(m_shaderStage)->specialUserData; + auto &specialUserData = getUserDataUsage(m_shaderStage.value())->specialUserData; if (index < specialUserData.size()) specialUserData[index].entryArgIdx = argTys.size() + argOffset; } @@ -1693,7 +1691,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp // Only applies to wave32 // TODO: Can we further exclude PS if LDS_GROUP_SIZE == 0 - if (m_pipelineState->getShaderWaveSize(m_shaderStage) == 32 && + if (m_pipelineState->getShaderWaveSize(m_shaderStage.value()) == 32 && (m_shaderStage == ShaderStage::Compute || m_shaderStage == ShaderStage::Fragment || m_shaderStage == ShaderStage::Mesh)) { unsigned userDataLimit = m_shaderStage == ShaderStage::Mesh ? 8 : 16; @@ -1711,8 +1709,8 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp // Push the fixed system (not user data) register args. 
if (shaderInputs) - inRegMask |= shaderInputs->getShaderArgTys(m_pipelineState, m_shaderStage, origFunc, m_computeWithCalls, argTys, - argNames, argOffset); + inRegMask |= shaderInputs->getShaderArgTys(m_pipelineState, m_shaderStage.value(), origFunc, m_computeWithCalls, + argTys, argNames, argOffset); if (updateUserDataMap) { constexpr unsigned NumUserSgprs = 32; @@ -1732,7 +1730,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp } userDataIdx += dwordSize; } - m_pipelineState->setUserDataMap(m_shaderStage, userDataMap); + m_pipelineState->setUserDataMap(m_shaderStage.value(), userDataMap); } return inRegMask; @@ -1741,7 +1739,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp // ===================================================================================================================== // @param userDataValue : The value to be written into a user data entry. // @returns : True if the user data value corresponds to a special system user data value. -bool PatchEntryPointMutate::isSystemUserDataValue(unsigned userDataValue) const { +bool MutateEntryPoint::isSystemUserDataValue(unsigned userDataValue) const { if (userDataValue < static_cast(UserDataMapping::GlobalTable)) { return false; } @@ -1751,7 +1749,7 @@ bool PatchEntryPointMutate::isSystemUserDataValue(unsigned userDataValue) const // ===================================================================================================================== // @param userDataValue : The value to be written into a user data entry. // @returns : True if the user data value corresponds to an unlinked descriptor set. 
-bool PatchEntryPointMutate::isUnlinkedDescriptorSetValue(unsigned userDataValue) const { +bool MutateEntryPoint::isUnlinkedDescriptorSetValue(unsigned userDataValue) const { if (userDataValue < static_cast(UserDataMapping::DescriptorSet0)) { return false; } @@ -1761,19 +1759,18 @@ bool PatchEntryPointMutate::isUnlinkedDescriptorSetValue(unsigned userDataValue) // ===================================================================================================================== // Add a UserDataArg to the appropriate vector for each special argument (e.g. ViewId) needed in user data SGPRs. // In here, we need to check whether an argument is needed in two ways: -// 1. Whether a flag is set saying it will be needed after PatchEntryPointMutate +// 1. Whether a flag is set saying it will be needed after MutateEntryPoint // 2. Whether there is an actual use of the special user data value (lgc.special.user.data call) generated -// before PatchEntryPointMutate, which we check with userDataUsage->isSpecialUserDataUsed(). +// before MutateEntryPoint, which we check with userDataUsage->isSpecialUserDataUsed(). 
// // @param userDataArgs : Vector to add args to when they need to go before user data nodes (just streamout) // @param specialUserDataArgs : Vector to add args to when they need to go after user data nodes (all the rest) // @param builder : IRBuilder to get types from -void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl &userDataArgs, - SmallVectorImpl &specialUserDataArgs, - IRBuilder<> &builder) { +void MutateEntryPoint::addSpecialUserDataArgs(SmallVectorImpl &userDataArgs, + SmallVectorImpl &specialUserDataArgs, IRBuilder<> &builder) { - auto userDataUsage = getUserDataUsage(m_shaderStage); - auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage); + auto userDataUsage = getUserDataUsage(m_shaderStage.value()); + auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage.value()); auto &entryArgIdxs = intfData->entryArgIdxs; bool enableNgg = m_pipelineState->isGraphics() ? m_pipelineState->getNggControl()->enableNgg : false; @@ -1788,7 +1785,7 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl if (m_pipelineState->getInputAssemblyState().multiView != MultiViewMode::Disable) { unsigned *argIdx = nullptr; auto userDataValue = UserDataMapping::ViewId; - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: argIdx = &entryArgIdxs.vs.viewId; break; @@ -1807,7 +1804,7 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "viewId", userDataValue, argIdx)); } - if (getMergedShaderStage(m_shaderStage) == getMergedShaderStage(ShaderStage::Vertex)) { + if (getMergedShaderStage(m_shaderStage.value()) == getMergedShaderStage(ShaderStage::Vertex)) { // This is the VS, or the shader that VS is merged into on GFX9+. 
auto vsIntfData = m_pipelineState->getShaderInterfaceData(ShaderStage::Vertex); auto vsResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Vertex); @@ -1917,7 +1914,7 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl // If no NGG, stream out table will be set to copy shader's user data entry, we should not set it duplicately. unsigned *tablePtr = nullptr; - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: tablePtr = &intfData->entryArgIdxs.vs.streamOutData.tablePtr; break; @@ -1951,7 +1948,7 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl if (m_pipelineState->enableSwXfb()) { unsigned *controlBufPtr = nullptr; - switch (m_shaderStage) { + switch (m_shaderStage.value()) { case ShaderStage::Vertex: controlBufPtr = &intfData->entryArgIdxs.vs.streamOutData.controlBufPtr; break; @@ -1980,9 +1977,9 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl // of user data arguments // @param specialUserDataArgs : list of suffix "system value" user data arguments // @param builder : IRBuilder to get types from -void PatchEntryPointMutate::finalizeUserDataArgs(SmallVectorImpl &userDataArgs, - ArrayRef specialUserDataArgs, IRBuilder<> &builder) { - auto userDataUsage = getUserDataUsage(m_shaderStage); +void MutateEntryPoint::finalizeUserDataArgs(SmallVectorImpl &userDataArgs, + ArrayRef specialUserDataArgs, IRBuilder<> &builder) { + auto userDataUsage = getUserDataUsage(m_shaderStage.value()); // In compute-with-calls, we need to ensure that the compute shader and library code agree that s15 is the spill // table pointer, even if it is not needed, because library code does not know whether a spill table pointer is @@ -2120,9 +2117,8 @@ void PatchEntryPointMutate::finalizeUserDataArgs(SmallVectorImpl &u // Get UserDataUsage struct for the merged shader stage that contains the given shader stage // // @param stage : Shader stage -PatchEntryPointMutate::UserDataUsage 
*PatchEntryPointMutate::getUserDataUsage(ShaderStageEnum stage) { +MutateEntryPoint::UserDataUsage *MutateEntryPoint::getUserDataUsage(ShaderStageEnum stage) { stage = getMergedShaderStage(stage); - m_userDataUsage.resize(std::max(m_userDataUsage.size(), static_cast(stage) + 1)); if (!m_userDataUsage[stage]) m_userDataUsage[stage] = std::make_unique(); return &*m_userDataUsage[stage]; @@ -2136,7 +2132,7 @@ PatchEntryPointMutate::UserDataUsage *PatchEntryPointMutate::getUserDataUsage(Sh // TES -> GS (if it exists) // // @param stage : Shader stage -ShaderStageEnum PatchEntryPointMutate::getMergedShaderStage(ShaderStageEnum stage) const { +ShaderStageEnum MutateEntryPoint::getMergedShaderStage(ShaderStageEnum stage) const { switch (stage) { case ShaderStage::Vertex: if (m_pipelineState->hasShaderStage(ShaderStage::TessControl)) @@ -2153,18 +2149,18 @@ ShaderStageEnum PatchEntryPointMutate::getMergedShaderStage(ShaderStageEnum stag } // ===================================================================================================================== -bool PatchEntryPointMutate::isComputeWithCalls() const { +bool MutateEntryPoint::isComputeWithCalls() const { return m_computeWithCalls; } // ===================================================================================================================== -bool PatchEntryPointMutate::UserDataUsage::isSpecialUserDataUsed(UserDataMapping kind) { +bool MutateEntryPoint::UserDataUsage::isSpecialUserDataUsed(UserDataMapping kind) { unsigned index = static_cast(kind) - static_cast(UserDataMapping::GlobalTable); return specialUserData.size() > index && !specialUserData[index].users.empty(); } // ===================================================================================================================== -void PatchEntryPointMutate::UserDataUsage::addLoad(unsigned dwordOffset, unsigned dwordSize) { +void MutateEntryPoint::UserDataUsage::addLoad(unsigned dwordOffset, unsigned dwordSize) { assert(dwordOffset + 
dwordSize <= 256 && "shader uses a user data region that is too large"); if (dwordOffset + dwordSize > loadSizes.size()) diff --git a/lgc/patch/NggPrimShader.cpp b/lgc/patch/NggPrimShader.cpp index a1e25e904d..73207cd6b0 100644 --- a/lgc/patch/NggPrimShader.cpp +++ b/lgc/patch/NggPrimShader.cpp @@ -246,9 +246,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // ES-GS ring if (ldsLayout) { - // NOTE: We round ES-GS LDS size to 4-dword alignment. This is for later LDS read/write operations of mutilple - // dwords (such as DS128). - ldsRegionSize = alignTo(calcFactor.esGsLdsSize, 4U); + ldsRegionSize = calcFactor.esGsLdsSize; printLdsRegionInfo("ES-GS Ring", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::EsGsRing] = std::make_pair(ldsOffset, ldsRegionSize); @@ -7049,7 +7047,6 @@ void NggPrimShader::prepareSwXfb(ArrayRef primCountInSubgroup) { } Value *dwordsWritten[MaxTransformFeedbackBuffers] = {}; - Value *dwordsPerPrim[MaxTransformFeedbackBuffers] = {}; // Calculate numPrimsToWrite for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { @@ -7090,20 +7087,14 @@ void NggPrimShader::prepareSwXfb(ArrayRef primCountInSubgroup) { dwordsRemaining = m_builder.CreateIntrinsic(Intrinsic::smax, dwordsRemaining->getType(), {dwordsRemaining, m_builder.getInt32(0)}); // numPrimsToWrite = min(dwordsRemaining / dwordsPerPrim, numPrimsToWrite) - dwordsPerPrim[i] = + Value *dwordsPerPrim = m_builder.CreateMul(m_verticesPerPrimitive, m_builder.getInt32(xfbStrides[i] / sizeof(unsigned))); - Value *primsCanWrite = m_builder.CreateUDiv(dwordsRemaining, dwordsPerPrim[i]); + Value *primsCanWrite = m_builder.CreateUDiv(dwordsRemaining, dwordsPerPrim); numPrimsToWrite[xfbBufferToStream[i]] = m_builder.CreateIntrinsic(Intrinsic::umin, numPrimsToWrite[xfbBufferToStream[i]]->getType(), {numPrimsToWrite[xfbBufferToStream[i]], primsCanWrite}); - } - - // Increment dwordsWritten - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - 
if (!bufferActive[i]) - continue; - Value *dwordsToWrite = m_builder.CreateMul(numPrimsToWrite[xfbBufferToStream[i]], dwordsPerPrim[i]); + Value *dwordsToWrite = m_builder.CreateMul(numPrimsToWrite[xfbBufferToStream[i]], dwordsPerPrim); if (i == lastActiveBuffer) { // ds_ordered_count, wave done diff --git a/lgc/patch/PassRegistry.inc b/lgc/patch/PassRegistry.inc index c2018e9f34..322336a266 100644 --- a/lgc/patch/PassRegistry.inc +++ b/lgc/patch/PassRegistry.inc @@ -57,18 +57,18 @@ LLPC_MODULE_PASS("lgc-builder-replayer", BuilderReplayer) LLPC_MODULE_PASS("lgc-continufy", Continufy) LLPC_MODULE_PASS("lgc-patch-resource-collect", PatchResourceCollect) LLPC_MODULE_PASS("lgc-patch-initialize-workgroup-memory", PatchInitializeWorkgroupMemory) -LLPC_MODULE_PASS("lgc-patch-image-derivatives", PatchImageDerivatives) +LLPC_MODULE_PASS("lgc-lower-image-derivatives", LowerImageDerivatives) LLPC_MODULE_PASS("lgc-patch-in-out-import-export", PatchInOutImportExport) -LLPC_FUNCTION_PASS("lgc-patch-invariant-loads", PatchInvariantLoads) +LLPC_FUNCTION_PASS("lgc-lower-invariant-loads", LowerInvariantLoads) LLPC_MODULE_PASS("lgc-patch-setup-target-features", PatchSetupTargetFeatures) -LLPC_MODULE_PASS("lgc-patch-copy-shader", PatchCopyShader) +LLPC_MODULE_PASS("lgc-generate-copy-shader", GenerateCopyShader) LLPC_MODULE_PASS("lgc-patch-prepare-pipeline-abi", PatchPreparePipelineAbi) LLPC_FUNCTION_PASS("lgc-patch-read-first-lane", PatchReadFirstLane) LLPC_MODULE_PASS("lgc-patch-llvm-ir-inclusion", PatchLlvmIrInclusion) LLPC_FUNCTION_PASS("lgc-patch-peephole-opt", PatchPeepholeOpt) LLPC_MODULE_PASS("lgc-lower-subgroup-ops", LowerSubgroupOps) -LLPC_MODULE_PASS("lgc-patch-entry-point-mutate", PatchEntryPointMutate) -LLPC_MODULE_PASS("lgc-patch-check-shader-cache", PatchCheckShaderCache) +LLPC_MODULE_PASS("lgc-mutate-entry-point", MutateEntryPoint) +LLPC_MODULE_PASS("lgc-patch-check-shader-cache", CheckShaderCache) LLPC_LOOP_PASS("lgc-patch-loop-metadata", PatchLoopMetadata) 
LLPC_FUNCTION_PASS("lgc-patch-buffer-op", PatchBufferOp) LLPC_MODULE_PASS("lgc-patch-workarounds", PatchWorkarounds) diff --git a/lgc/patch/Patch.cpp b/lgc/patch/Patch.cpp index fbb01c5c57..bac8f27977 100644 --- a/lgc/patch/Patch.cpp +++ b/lgc/patch/Patch.cpp @@ -37,33 +37,34 @@ #include "lgc/PassManager.h" #include "lgc/Pipeline.h" #include "lgc/builder/BuilderReplayer.h" +#include "lgc/patch/AddLoopMetadata.h" +#include "lgc/patch/CheckShaderCache.h" +#include "lgc/patch/CollectImageOperations.h" #include "lgc/patch/Continufy.h" #include "lgc/patch/FragColorExport.h" +#include "lgc/patch/GenerateCopyShader.h" +#include "lgc/patch/IncludeLlvmIr.h" #include "lgc/patch/LowerDebugPrintf.h" #include "lgc/patch/LowerDesc.h" #include "lgc/patch/LowerGpuRt.h" +#include "lgc/patch/LowerImageDerivatives.h" +#include "lgc/patch/LowerInOut.h" +#include "lgc/patch/LowerInvariantLoads.h" +#include "lgc/patch/LowerMulDx9Zero.h" #include "lgc/patch/LowerSubgroupOps.h" +#include "lgc/patch/MutateEntryPoint.h" #include "lgc/patch/PatchBufferOp.h" -#include "lgc/patch/PatchCheckShaderCache.h" -#include "lgc/patch/PatchCopyShader.h" -#include "lgc/patch/PatchEntryPointMutate.h" -#include "lgc/patch/PatchImageDerivatives.h" -#include "lgc/patch/PatchImageOpCollect.h" -#include "lgc/patch/PatchInOutImportExport.h" #include "lgc/patch/PatchInitializeWorkgroupMemory.h" -#include "lgc/patch/PatchInvariantLoads.h" -#include "lgc/patch/PatchLlvmIrInclusion.h" -#include "lgc/patch/PatchLoadScalarizer.h" -#include "lgc/patch/PatchLoopMetadata.h" -#include "lgc/patch/PatchMulDx9Zero.h" #include "lgc/patch/PatchPeepholeOpt.h" #include "lgc/patch/PatchPreparePipelineAbi.h" #include "lgc/patch/PatchReadFirstLane.h" #include "lgc/patch/PatchResourceCollect.h" #include "lgc/patch/PatchSetupTargetFeatures.h" #include "lgc/patch/PatchWorkarounds.h" +#include "lgc/patch/ScalarizeLoads.h" #include "lgc/patch/TcsPassthroughShader.h" #include "lgc/patch/VertexFetch.h" + #if LLPC_BUILD_STRIX1 #include 
"lgc/patch/WorkaroundDsSubdwordWrite.h" #endif @@ -200,23 +201,23 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T passMgr.addPass(PatchNullFragShader()); passMgr.addPass(PatchResourceCollect()); // also removes inactive/unused resources - // PatchCheckShaderCache depends on PatchResourceCollect - passMgr.addPass(PatchCheckShaderCache(std::move(checkShaderCacheFunc))); + // CheckShaderCache depends on PatchResourceCollect + passMgr.addPass(CheckShaderCache(std::move(checkShaderCacheFunc))); // First part of lowering to "AMDGCN-style" passMgr.addPass(PatchWorkarounds()); - passMgr.addPass(PatchCopyShader()); + passMgr.addPass(GenerateCopyShader()); passMgr.addPass(LowerVertexFetch()); passMgr.addPass(LowerFragColorExport()); passMgr.addPass(LowerDebugPrintf()); passMgr.addPass(LowerDesc()); - passMgr.addPass(PatchEntryPointMutate()); + passMgr.addPass(MutateEntryPoint()); passMgr.addPass(createModuleToFunctionPassAdaptor(LowerPopsInterlock())); passMgr.addPass(PatchInitializeWorkgroupMemory()); passMgr.addPass(PatchInOutImportExport()); // Patch invariant load and loop metadata. - passMgr.addPass(createModuleToFunctionPassAdaptor(PatchInvariantLoads())); + passMgr.addPass(createModuleToFunctionPassAdaptor(LowerInvariantLoads())); passMgr.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor(PatchLoopMetadata()))); #if LLPC_BUILD_STRIX1 @@ -276,7 +277,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); } - passMgr.addPass(PatchImageDerivatives()); + passMgr.addPass(LowerImageDerivatives()); // Set up target features in shader entry-points. 
// NOTE: Needs to be done after post-NGG function inlining, because LLVM refuses to inline something @@ -486,7 +487,7 @@ void Patch::addOptimizationPasses(lgc::PassManager &passMgr, uint32_t optLevel) void Patch::init(Module *module) { m_module = module; m_context = &m_module->getContext(); - m_shaderStage = ShaderStage::Invalid; + m_shaderStage = std::nullopt; m_entryPoint = nullptr; } diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/PatchBufferOp.cpp index c6948a2258..d5b7c1dac2 100644 --- a/lgc/patch/PatchBufferOp.cpp +++ b/lgc/patch/PatchBufferOp.cpp @@ -775,11 +775,14 @@ void BufferOpLowering::visitStridedBufferAddrAndStrideToPtr(StridedBufferAddrAnd // @param loadDescToPtr : The instruction void BufferOpLowering::visitBufferLoadDescToPtr(BufferLoadDescToPtrOp &loadDescToPtr) { m_builder.SetInsertPoint(&loadDescToPtr); - Value *descriptor = - createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); - - if (loadDescToPtr.getIsCompact()) - descriptor = createCompactDesc(descriptor, nullptr); + bool needLoadDesc = true; + Value *descriptor = loadDescToPtr.getDescPtr(); + if (needLoadDesc) { + descriptor = + createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); + if (loadDescToPtr.getIsCompact()) + descriptor = createCompactDesc(descriptor, nullptr); + } m_typeLowering.replaceInstruction(&loadDescToPtr, {descriptor, ConstantPointerNull::get(m_offsetType)}); @@ -804,11 +807,15 @@ void BufferOpLowering::visitBufferLoadDescToPtr(BufferLoadDescToPtrOp &loadDescT // @param loadDescToPtr : The instruction void BufferOpLowering::visitStridedBufferLoadDescToPtr(StridedBufferLoadDescToPtrOp &loadDescToPtr) { m_builder.SetInsertPoint(&loadDescToPtr); - Value *descriptor = - createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); - - if (loadDescToPtr.getIsCompact()) - descriptor = createCompactDesc(descriptor, 
loadDescToPtr.getStride()); + bool needLoadDesc = true; + Value *descriptor = loadDescToPtr.getDescPtr(); + if (needLoadDesc) { + descriptor = + createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); + + if (loadDescToPtr.getIsCompact()) + descriptor = createCompactDesc(descriptor, loadDescToPtr.getStride()); + } m_typeLowering.replaceInstruction(&loadDescToPtr, {descriptor, ConstantPointerNull::get(m_offsetType), m_builder.getInt32(0)}); @@ -1486,6 +1493,7 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { auto pointerValues = m_typeLowering.getValue(pointerOperand); Value *const bufferDesc = pointerValues[0]; + const bool isIndexedDesc = isa(bufferDesc->getType()); const DataLayout &dataLayout = m_builder.GetInsertBlock()->getModule()->getDataLayout(); @@ -1502,9 +1510,10 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { const bool isDlc = isGlc; // For buffer load on GFX10+, we set DLC = GLC Value *const baseIndex = m_builder.CreatePtrToInt(pointerValues[1], m_builder.getInt32Ty()); + const bool isDivergentDesc = getDescriptorInfo(bufferDesc).divergent.value(); - // If our buffer descriptor is divergent, need to handle that differently. - if (getDescriptorInfo(bufferDesc).divergent.value()) { + if (!isIndexedDesc && isDivergentDesc) { + // If our buffer descriptor is divergent, need to handle that differently in non resource indexing mode. 
auto createLoadStoreFunc = [&](Value *pointer) { Value *result = nullptr; if (isLoad) { @@ -1588,6 +1597,14 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { } } + auto getBufferDesc = [&]() -> Value * { + if (isIndexedDesc) { + auto address = m_builder.CreatePtrToInt(bufferDesc, m_builder.getInt64Ty()); + return m_builder.CreateTrunc(address, m_builder.getInt32Ty()); + } + return bufferDesc; + }; + // The index in storeValue which we use next unsigned storeIndex = 0; @@ -1635,49 +1652,51 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { } if (isLoad) { + bool accessSizeAllowed = true; if (m_pipelineState.getTargetInfo().getGfxIpVersion().major <= 11) { // TODO For stores? coherent.bits.dlc = isDlc; + accessSizeAllowed = accessSize >= 4; } - if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) { - Value *indexValue = pointerValues[2]; - CallInst *call = nullptr; - // Especially when the index is a constant, and the stride is known at compile-time, - // we should create s_buffer_load instructions with constant offsets: index * stride + offset - if ((isInvariant && accessSize >= 4) && isa(indexValue)) { - Value *desc1 = m_builder.CreateExtractElement(bufferDesc, 1); + + Value *indexValue = pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER + ? 
pointerValues[2] + : nullptr; + if (isInvariant && !isDivergentDesc && accessSizeAllowed) { + // create s.buffer.load + Value *desc = bufferDesc; + if (isIndexedDesc) + desc = m_builder.CreateLoad(FixedVectorType::get(m_builder.getInt32Ty(), 4), bufferDesc); + if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) { + // Especially when the index is a constant, and the stride is known at compile-time, + // we should create s_buffer_load instructions with constant offsets: index * stride + offset + assert(isa(indexValue)); + Value *desc1 = m_builder.CreateExtractElement(desc, 1); // stride is 61:48 bits in descriptor, which will always be constantInt when create BufferDesc Value *stride = m_builder.CreateAnd(m_builder.CreateLShr(desc1, m_builder.getInt32(16)), m_builder.getInt32(0x3fff)); Value *indexOffsetVal = m_builder.CreateMul(indexValue, stride); offsetVal = m_builder.CreateAdd(offsetVal, indexOffsetVal); - call = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_buffer_load, intAccessType, - {bufferDesc, offsetVal, m_builder.getInt32(coherent.u32All)}); - } else { - call = m_builder.CreateIntrinsic( - Intrinsic::amdgcn_struct_buffer_load, intAccessType, - {bufferDesc, indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } - copyMetadata(call, &inst); - if (isInvariant) - call->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {})); - part = call; - } else if (isInvariant && accessSize >= 4) { + CallInst *call = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_buffer_load, intAccessType, - {bufferDesc, offsetVal, m_builder.getInt32(coherent.u32All)}); + {desc, offsetVal, m_builder.getInt32(coherent.u32All)}); call->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {})); + copyMetadata(call, &inst); part = call; } else { - unsigned intrinsicID = Intrinsic::amdgcn_raw_buffer_load; -#if !defined(LLVM_HAVE_BRANCH_AMD_GFX) -#warning[!amd-gfx] 
Atomic load loses memory semantics -#else - if (ordering != AtomicOrdering::NotAtomic) - intrinsicID = Intrinsic::amdgcn_raw_atomic_buffer_load; -#endif - part = m_builder.CreateIntrinsic( - intrinsicID, intAccessType, - {bufferDesc, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + if (indexValue) { + part = m_builder.CreateIntrinsic( + Intrinsic::amdgcn_struct_buffer_load, intAccessType, + {getBufferDesc(), indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + } else { + unsigned intrinsicID = Intrinsic::amdgcn_raw_buffer_load; + if (ordering != AtomicOrdering::NotAtomic) + intrinsicID = Intrinsic::amdgcn_raw_atomic_buffer_load; + part = m_builder.CreateIntrinsic( + intrinsicID, intAccessType, + {getBufferDesc(), offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + } } } else { // Store @@ -1692,12 +1711,12 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { copyMetadata(part, &inst); if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) { part = m_builder.CreateIntrinsic(Intrinsic::amdgcn_struct_buffer_store, intAccessType, - {part, bufferDesc, pointerValues[2], offsetVal, m_builder.getInt32(0), + {part, getBufferDesc(), pointerValues[2], offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } else { part = m_builder.CreateIntrinsic( Intrinsic::amdgcn_raw_buffer_store, intAccessType, - {part, bufferDesc, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + {part, getBufferDesc(), offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } } diff --git a/lgc/patch/PatchInitializeWorkgroupMemory.cpp b/lgc/patch/PatchInitializeWorkgroupMemory.cpp index fd81de41e9..1d5cc9ce8f 100644 --- a/lgc/patch/PatchInitializeWorkgroupMemory.cpp +++ b/lgc/patch/PatchInitializeWorkgroupMemory.cpp @@ -79,7 +79,7 @@ PreservedAnalyses PatchInitializeWorkgroupMemory::run(Module &module, ModuleAnal 
Patch::init(&module); m_shaderStage = ShaderStage::Compute; - m_entryPoint = pipelineShaders.getEntryPoint(static_cast(m_shaderStage)); + m_entryPoint = pipelineShaders.getEntryPoint(m_shaderStage.value()); BuilderBase builder(*m_context); builder.SetInsertPointPastAllocas(m_entryPoint); @@ -133,7 +133,7 @@ void PatchInitializeWorkgroupMemory::initializeWithZero(GlobalVariable *lds, Bui builder.SetInsertPoint(originBlock->getTerminator()); // Get thread info auto &shaderMode = m_pipelineState->getShaderModes()->getComputeShaderMode(); - const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage)->entryArgIdxs; + const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs; Value *localInvocationId = getFunctionArgument(m_entryPoint, entryArgIdxs.cs.localInvocationId); const unsigned actualNumThreads = shaderMode.workgroupSizeX * shaderMode.workgroupSizeY * shaderMode.workgroupSizeZ; diff --git a/lgc/patch/PatchResourceCollect.cpp b/lgc/patch/PatchResourceCollect.cpp index 6ba2a49442..85d8ec0f06 100644 --- a/lgc/patch/PatchResourceCollect.cpp +++ b/lgc/patch/PatchResourceCollect.cpp @@ -41,6 +41,7 @@ #include "lgc/util/BuilderBase.h" #include "lgc/util/Debug.h" #include "llvm-dialects/Dialect/Visitor.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -94,8 +95,7 @@ PreservedAnalyses PatchResourceCollect::run(Module &module, ModuleAnalysisManage m_tcsInputHasDynamicIndexing = false; bool needPack = false; - for (int shaderStage = 0; shaderStage < ShaderStage::GfxCount; ++shaderStage) { - ShaderStageEnum stage = static_cast(shaderStage); + for (auto stage : ShaderStagesGraphics) { if (pipelineState->hasShaderStage(stage) && (pipelineState->canPackInput(stage) || pipelineState->canPackOutput(stage))) { needPack = true; @@ -109,9 +109,9 @@ PreservedAnalyses PatchResourceCollect::run(Module &module, 
ModuleAnalysisManage } // Process each shader stage, in reverse order. We process FS even if it does not exist (part-pipeline compile). - for (int shaderStage = ShaderStage::CountInternal - 1; shaderStage >= 0; --shaderStage) { - m_entryPoint = pipelineShaders.getEntryPoint(static_cast(shaderStage)); - m_shaderStage = static_cast(shaderStage); + for (auto shaderStage : llvm::reverse(ShaderStagesNativeCopy)) { + m_entryPoint = pipelineShaders.getEntryPoint(shaderStage); + m_shaderStage = shaderStage; if (m_entryPoint) processShader(); else if (m_shaderStage == ShaderStage::Fragment) @@ -538,7 +538,9 @@ bool PatchResourceCollect::checkGsOnChipValidity() { // NOTE: Make gsVsVertexItemSize odd by "| 1", to optimize GS -> VS ring layout for LDS bank conflicts. unsigned gsVsVertexItemTotalSize = 0; for (int i = 0; i < MaxGsStreams; ++i) { - gsVsVertexItemSize[i] = (4 * gsResUsage->inOutUsage.gs.outLocCount[i]) | 1; + gsVsVertexItemSize[i] = 4 * gsResUsage->inOutUsage.gs.outLocCount[i]; + if (gsVsVertexItemSize[i] != 0) + gsVsVertexItemSize[i] |= 1; // If vertex item size is 0, this stream is inactive without any export. gsVsVertexItemTotalSize += gsVsVertexItemSize[i]; } @@ -631,6 +633,9 @@ bool PatchResourceCollect::checkGsOnChipValidity() { assert(gsInstanceCount == 1); } + // The minimum number of esVertsPerSubgroup must be at least the number of vertices per primitive. + esVertsPerSubgroup = std::max(inVertsPerPrim, esVertsPerSubgroup); + // NOTE: If ray query uses LDS stack, the expected max thread count in the group is 64. And we force wave size // to be 64 in order to keep all threads in the same wave. In the future, we could consider to get rid of this // restriction by providing the capability of querying thread ID in the group rather than in wave. 
@@ -952,69 +957,133 @@ bool PatchResourceCollect::checkGsOnChipValidity() { } LLPC_OUTS("===============================================================================\n"); - LLPC_OUTS("// LLPC geometry calculation factor results\n\n"); - LLPC_OUTS("ES vertices per subgroup: " << gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup << "\n"); - LLPC_OUTS("GS primitives per subgroup: " << gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup << "\n"); + LLPC_OUTS("// LLPC HW GS configurations\n\n"); + LLPC_OUTS("HW GS = "); + if (meshPipeline) { + LLPC_OUTS("Mesh shader\n"); + } else if (m_pipelineState->getNggControl()->enableNgg) { + LLPC_OUTS((hasGs ? "NGG GS" : "NGG") << "\n"); + } else { + LLPC_OUTS("Legacy GS (" << (gsOnChip ? "Onchip" : "Offchip") << ")\n"); + } LLPC_OUTS("\n"); - LLPC_OUTS("ES-GS LDS size (in dwords): " << gsResUsage->inOutUsage.gs.calcFactor.esGsLdsSize << "\n"); - LLPC_OUTS("On-chip GS LDS size (in dwords): " << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << "\n"); + + LLPC_OUTS("EsVerts = " << gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup << " verts/subgroup\n"); + LLPC_OUTS("GsPrims = " << gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup << " prims/subgroup\n"); + LLPC_OUTS("\n"); + + LLPC_OUTS("EsGsLdsSize = " << gsResUsage->inOutUsage.gs.calcFactor.esGsLdsSize << " dwords\n"); + LLPC_OUTS("GsOnchipLdsSize = " << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << " dwords\n"); + if (gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize > 0) { + LLPC_OUTS("RayQueryLdsStack = " << gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize << " dwords (Start = " + << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << ")\n"); + } LLPC_OUTS("\n"); - LLPC_OUTS("ES-GS ring item size (in dwords): " << gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize << "\n"); - LLPC_OUTS("GS-VS ring item size (in dwords): " << gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize << "\n"); + + 
LLPC_OUTS("EsGsRingItemSize = " << gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize << " dwords\n"); + LLPC_OUTS("GsVsRingItemSize = " << gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize << " dwords\n"); + LLPC_OUTS("GsVsVertexItemSizes = ["); + for (unsigned i = 0; i < MaxGsStreams; ++i) { + LLPC_OUTS(gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i]); + LLPC_OUTS((i == MaxGsStreams - 1 ? "" : ", ")); + } + LLPC_OUTS("] dwords\n"); LLPC_OUTS("\n"); + if (meshPipeline || m_pipelineState->getNggControl()->enableNgg) { + LLPC_OUTS("PrimAmpFactor = " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); + LLPC_OUTS("EnableMaxVertOut = " << (gsResUsage->inOutUsage.gs.calcFactor.enableMaxVertOut ? "true" : "false") + << "\n"); + LLPC_OUTS("\n"); + } + if (hasGs) { - LLPC_OUTS("GS stream item sizes (in dwords):\n"); + LLPC_OUTS("InputPrimitive = "); + switch (geometryMode.inputPrimitive) { + case InputPrimitives::Points: + LLPC_OUTS("Points\n"); + break; + case InputPrimitives::Lines: + LLPC_OUTS("Lines\n"); + break; + case InputPrimitives::LinesAdjacency: + LLPC_OUTS("LinesAdjacency\n"); + break; + case InputPrimitives::Triangles: + LLPC_OUTS("Triangles\n"); + break; + case InputPrimitives::TrianglesAdjacency: + LLPC_OUTS("TrianglesAdjacency\n"); + break; + case InputPrimitives::Patch: + LLPC_OUTS("Patch (ControlPoints = " << geometryMode.controlPoints << ")\n"); + break; + default: + break; + } + LLPC_OUTS("OutputPrimitive = "); + switch (geometryMode.outputPrimitive) { + case OutputPrimitives::Points: + LLPC_OUTS("Points\n"); + break; + case OutputPrimitives::LineStrip: + LLPC_OUTS("LineStrip\n"); + break; + case OutputPrimitives::TriangleStrip: + LLPC_OUTS("TriangleStrip\n"); + break; + default: + break; + } + LLPC_OUTS("Invocations = " << geometryMode.invocations << "\n"); + LLPC_OUTS("MaxOutputVertices = " << geometryMode.outputVertices << "\n"); + LLPC_OUTS("RobustGsEmits = " << (geometryMode.robustGsEmits ? 
"true" : "false") << "\n"); + LLPC_OUTS("\n"); + + const unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + LLPC_OUTS("RasterStream = "); + if (rasterStream != InvalidValue) + LLPC_OUTS("Stream[" << rasterStream << "]\n"); + else + LLPC_OUTS("NoRasterization\n"); + + const auto &streamXfbBuffers = m_pipelineState->getStreamXfbBuffers(); for (unsigned i = 0; i < MaxGsStreams; ++i) { unsigned streamItemSize = gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] * geometryMode.outputVertices; - LLPC_OUTS(" stream[" << i << "] = " << streamItemSize); - + LLPC_OUTS("Stream[" << i << "] = " << streamItemSize << " dwords"); + if (streamItemSize == 0) + LLPC_OUTS(" (Inactive)"); + LLPC_OUTS(" => "); if (m_pipelineState->enableXfb()) { - const auto &streamXfbBuffers = m_pipelineState->getStreamXfbBuffers(); - LLPC_OUTS(", XFB buffers = { "); if (streamXfbBuffers[i] != 0) { + LLPC_OUTS("XfbBuffer["); + bool printFirstXfbBuffer = true; for (unsigned j = 0; j < MaxTransformFeedbackBuffers; ++j) { - if ((streamXfbBuffers[i] & (1 << j)) != 0) - LLPC_OUTS(j << " "); + if ((streamXfbBuffers[i] & (1 << j)) != 0) { + LLPC_OUTS((printFirstXfbBuffer ? 
"" : ", ") << j << ""); + printFirstXfbBuffer = false; + } } + LLPC_OUTS("]"); + } else { + LLPC_OUTS("NoXfb"); } - LLPC_OUTS("}"); + } else { + LLPC_OUTS("NoXfb"); } - LLPC_OUTS("\n"); } LLPC_OUTS("\n"); } - if (gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize > 0) { - LLPC_OUTS("Ray query LDS stack size (in dwords): " - << gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize - << " (start = " << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << ")\n\n"); - } - - if (meshPipeline) { - LLPC_OUTS("GS primitive amplification factor: " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("GS is on-chip (Mesh)\n"); - } else if (m_pipelineState->getNggControl()->enableNgg) { - LLPC_OUTS("GS primitive amplifier: " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); - LLPC_OUTS("GS enable max output vertices: " - << (gsResUsage->inOutUsage.gs.calcFactor.enableMaxVertOut ? "true" : "false") << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("GS is on-chip (NGG)\n"); - } else { - LLPC_OUTS("GS is " << (gsOnChip ? "on-chip" : "off-chip") << "\n"); - } - LLPC_OUTS("\n"); - return gsOnChip; } // ===================================================================================================================== // Process a single shader. 
void PatchResourceCollect::processShader() { - m_resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + m_resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); // Invoke handling of "call" instruction visit(m_entryPoint); @@ -1081,7 +1150,7 @@ void PatchResourceCollect::processMissingFs() { assert(m_shaderStage == ShaderStage::Fragment); if (!m_processMissingFs) return; - m_resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + m_resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); FsInputMappings fsInputMappings = {}; m_pipelineState->getPalMetadata()->retrieveFragmentInputInfo(fsInputMappings); @@ -1226,7 +1295,7 @@ void PatchResourceCollect::visitCallInst(CallInst &callInst) { // Collect transform feedback export calls, used in SW-emulated stream-out. For GS, the collecting will // be done when we generate copy shader since GS is primitive-based. if (m_shaderStage != ShaderStage::Geometry) { - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; // A transform feedback export call is expected to be <4 x dword> at most inOutUsage.xfbExpCount += outputValue->getType()->getPrimitiveSizeInBits() > 128 ? 
2 : 1; } @@ -1506,11 +1575,11 @@ void PatchResourceCollect::matchGenericInOut() { assert(m_pipelineState->isGraphics()); // Do input matching and location remapping - bool packInput = m_pipelineState->canPackInput(m_shaderStage); + bool packInput = m_pipelineState->canPackInput(m_shaderStage.value()); if (m_shaderStage == ShaderStage::TessControl && m_tcsInputHasDynamicIndexing) { packInput = false; // Disable to pack VS-TCS - m_pipelineState->setPackInput(m_shaderStage, false); + m_pipelineState->setPackInput(m_shaderStage.value(), false); m_pipelineState->setPackOutput(ShaderStage::Vertex, false); } if (packInput) @@ -1519,7 +1588,7 @@ void PatchResourceCollect::matchGenericInOut() { updateInputLocInfoMapWithUnpack(); // Do output matching and location remapping - bool packOutput = m_pipelineState->canPackOutput(m_shaderStage); + bool packOutput = m_pipelineState->canPackOutput(m_shaderStage.value()); if (m_shaderStage == ShaderStage::Vertex && m_tcsInputHasDynamicIndexing) assert(!packOutput); if (packOutput) { @@ -1535,8 +1604,9 @@ void PatchResourceCollect::matchGenericInOut() { // Update location count of input/output LLPC_OUTS("===============================================================================\n"); - LLPC_OUTS("// LLPC location input/output mapping results (" << getShaderStageAbbreviation(m_shaderStage) << ")\n\n"); - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + LLPC_OUTS("// LLPC location input/output mapping results (" << getShaderStageAbbreviation(m_shaderStage.value()) + << ")\n\n"); + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &inLocInfoMap = inOutUsage.inputLocInfoMap; auto &outLocInfoMap = inOutUsage.outputLocInfoMap; auto &perPatchInLocMap = inOutUsage.perPatchInputLocMap; @@ -1553,8 +1623,8 @@ void PatchResourceCollect::matchGenericInOut() { const unsigned newComp = locInfoPair.second.getComponent(); assert(newLoc != InvalidValue); 
inOutUsage.inputMapLocCount = std::max(inOutUsage.inputMapLocCount, newLoc + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input: [location, component] = [" << origLoc - << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input: [location, component] = [" + << origLoc << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); } LLPC_OUTS("\n"); } @@ -1576,13 +1646,13 @@ void PatchResourceCollect::matchGenericInOut() { inOutUsage.gs.outLocCount[2] + inOutUsage.gs.outLocCount[3]; inOutUsage.outputMapLocCount = std::max(inOutUsage.outputMapLocCount, assignedLocCount); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: stream = " << streamId << ", " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output: stream = " << streamId << ", " << " [location, component] = [" << origLoc << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); } else { inOutUsage.outputMapLocCount = std::max(inOutUsage.outputMapLocCount, newLoc + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: [location, component] = [" << origLoc - << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output: [location, component] = [" + << origLoc << ", " << origComp << "] => Mapped = [" << newLoc << ", " << newComp << "]\n"); } } LLPC_OUTS("\n"); @@ -1593,8 +1663,8 @@ void PatchResourceCollect::matchGenericInOut() { for (auto locMap : perPatchInLocMap) { assert(m_shaderStage == ShaderStage::TessEval && locMap.second != InvalidValue); inOutUsage.perPatchInputMapLocCount = std::max(inOutUsage.perPatchInputMapLocCount, locMap.second + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input (per-patch): location = " << locMap.first - << " => 
Mapped = " << locMap.second << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-patch): location = " + << locMap.first << " => Mapped = " << locMap.second << "\n"); } LLPC_OUTS("\n"); } @@ -1604,8 +1674,8 @@ void PatchResourceCollect::matchGenericInOut() { for (auto locMap : perPatchOutLocMap) { assert(m_shaderStage == ShaderStage::TessControl && locMap.second != InvalidValue); inOutUsage.perPatchOutputMapLocCount = std::max(inOutUsage.perPatchOutputMapLocCount, locMap.second + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output (per-patch): location = " << locMap.first - << " => Mapped = " << locMap.second << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-patch): location = " + << locMap.first << " => Mapped = " << locMap.second << "\n"); } LLPC_OUTS("\n"); } @@ -1615,7 +1685,7 @@ void PatchResourceCollect::matchGenericInOut() { for (auto locMap : perPrimitiveInLocMap) { assert(m_shaderStage == ShaderStage::Fragment && locMap.second != InvalidValue); inOutUsage.perPrimitiveInputMapLocCount = std::max(inOutUsage.perPrimitiveInputMapLocCount, locMap.second + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input (per-primitive): location = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-primitive): location = " << locMap.first << " => Mapped = " << locMap.second << "\n"); } LLPC_OUTS("\n"); @@ -1626,31 +1696,31 @@ void PatchResourceCollect::matchGenericInOut() { for (auto locMap : perPrimitiveOutLocMap) { assert(m_shaderStage == ShaderStage::Mesh && locMap.second != InvalidValue); inOutUsage.perPrimitiveOutputMapLocCount = std::max(inOutUsage.perPrimitiveOutputMapLocCount, locMap.second + 1); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output (per-primitive): location = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output 
(per-primitive): location = " << locMap.first << " => Mapped = " << locMap.second << "\n"); } LLPC_OUTS("\n"); } LLPC_OUTS("// LLPC location count results (after input/output matching) \n\n"); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input: locations = " << inOutUsage.inputMapLocCount - << "\n"); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: locations = " << inOutUsage.outputMapLocCount - << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) + << ") Input: locations = " << inOutUsage.inputMapLocCount << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) + << ") Output: locations = " << inOutUsage.outputMapLocCount << "\n"); if (m_shaderStage == ShaderStage::TessEval) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-patch): locations = " << inOutUsage.perPatchInputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::TessControl) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-patch): locations = " << inOutUsage.perPatchOutputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::Fragment) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-primitive): locations = " << inOutUsage.perPrimitiveInputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::Mesh) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-primitive): locations = " << inOutUsage.perPrimitiveOutputMapLocCount << "\n"); } LLPC_OUTS("\n"); @@ -1663,12 +1733,12 @@ void PatchResourceCollect::matchGenericInOut() { void PatchResourceCollect::mapBuiltInToGenericInOut() { 
assert(m_pipelineState->isGraphics()); - const auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage); + const auto resUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value()); auto &builtInUsage = resUsage->builtInUsage; auto &inOutUsage = resUsage->inOutUsage; - const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); + const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); auto nextResUsage = nextStage ? m_pipelineState->getShaderResourceUsage(nextStage.value()) : nullptr; assert(inOutUsage.builtInInputLocMap.empty()); // Should be empty @@ -2060,7 +2130,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { // NOTE: If gl_in[].gl_ClipDistance is used, we have to check the usage of gl_out[].gl_ClipDistance in // tessellation control shader. The clip distance is the maximum of the two. We do this to avoid // incorrectness of location assignment during builtin-to-generic mapping. - const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage.value()); if (prevStage == ShaderStage::TessControl) { const auto &prevBuiltInUsage = m_pipelineState->getShaderResourceUsage(prevStage.value())->builtInUsage.tcs; clipDistanceCount = std::max(clipDistanceCount, prevBuiltInUsage.clipDistance); @@ -2074,7 +2144,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { if (builtInUsage.tes.cullDistanceIn > 0) { unsigned cullDistanceCount = builtInUsage.tes.cullDistanceIn; - const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage.value()); if (prevStage == ShaderStage::TessControl) { const auto &prevBuiltInUsage = m_pipelineState->getShaderResourceUsage(prevStage.value())->builtInUsage.tcs; cullDistanceCount = std::max(cullDistanceCount, prevBuiltInUsage.clipDistance); @@ -2411,7 +2481,7 @@ void 
PatchResourceCollect::mapBuiltInToGenericInOut() { std::max(inOutUsage.perPrimitiveOutputMapLocCount, availPerPrimitiveOutMapLoc); } else if (m_shaderStage == ShaderStage::Fragment) { // FS - const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage.value()); unsigned availInMapLoc = inOutUsage.inputMapLocCount; unsigned availPerPrimitiveInMapLoc = inOutUsage.perPrimitiveInputMapLocCount; @@ -2467,11 +2537,12 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { // Do builtin-to-generic mapping LLPC_OUTS("===============================================================================\n"); - LLPC_OUTS("// LLPC builtin-to-generic mapping results (" << getShaderStageAbbreviation(m_shaderStage) << ")\n\n"); + LLPC_OUTS("// LLPC builtin-to-generic mapping results (" << getShaderStageAbbreviation(m_shaderStage.value()) + << ")\n\n"); for (const auto &builtInMap : inOutUsage.builtInInputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input: builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input: builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.builtInInputLocMap.empty()) @@ -2482,11 +2553,11 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { const unsigned loc = builtInMap.second; if (m_shaderStage == ShaderStage::Geometry) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output: stream = " << m_pipelineState->getRasterizerState().rasterStream << " , " << "builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } else { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: builtin = " + LLPC_OUTS("(" 
<< getShaderStageAbbreviation(m_shaderStage.value()) << ") Output: builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } } @@ -2496,7 +2567,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { for (const auto &builtInMap : inOutUsage.perPatchBuiltInInputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input (per-patch): builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-patch): builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.perPatchBuiltInInputLocMap.empty()) @@ -2505,7 +2576,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { for (const auto &builtInMap : inOutUsage.perPatchBuiltInOutputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output (per-patch): builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-patch): builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.perPatchBuiltInOutputLocMap.empty()) @@ -2514,7 +2585,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { for (const auto &builtInMap : inOutUsage.perPrimitiveBuiltInInputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input (per-primitive): builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-primitive): builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.perPrimitiveBuiltInInputLocMap.empty()) @@ -2523,31 +2594,31 @@ void 
PatchResourceCollect::mapBuiltInToGenericInOut() { for (const auto &builtInMap : inOutUsage.perPrimitiveBuiltInOutputLocMap) { const BuiltInKind builtInId = static_cast(builtInMap.first); const unsigned loc = builtInMap.second; - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output (per-primitive): builtin = " + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-primitive): builtin = " << PipelineState::getBuiltInName(builtInId) << " => Mapped = " << loc << "\n"); } if (!inOutUsage.perPrimitiveBuiltInOutputLocMap.empty()) LLPC_OUTS("\n"); LLPC_OUTS("// LLPC location count results (after builtin-to-generic mapping)\n\n"); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Input: locations = " << inOutUsage.inputMapLocCount - << "\n"); - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) << ") Output: locations = " << inOutUsage.outputMapLocCount - << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) + << ") Input: locations = " << inOutUsage.inputMapLocCount << "\n"); + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) + << ") Output: locations = " << inOutUsage.outputMapLocCount << "\n"); if (m_shaderStage == ShaderStage::TessEval) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-patch): locations = " << inOutUsage.perPatchInputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::TessControl) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-patch): locations = " << inOutUsage.perPatchOutputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::Fragment) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Input (per-primitive): locations = " << 
inOutUsage.perPrimitiveInputMapLocCount << "\n"); } if (m_shaderStage == ShaderStage::Mesh) { - LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage) + LLPC_OUTS("(" << getShaderStageAbbreviation(m_shaderStage.value()) << ") Output (per-primitive): locations = " << inOutUsage.perPrimitiveOutputMapLocCount << "\n"); } LLPC_OUTS("\n"); @@ -2578,7 +2649,7 @@ void PatchResourceCollect::mapGsBuiltInOutput(unsigned builtInId, unsigned elemC // ===================================================================================================================== // Update the inputLocInfoutputoMap, perPatchInputLocMap and perPrimitiveInputLocMap void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &inputLocInfoMap = inOutUsage.inputLocInfoMap; // Remove unused locationInfo bool eraseUnusedLocInfo = !m_pipelineState->isUnlinked(); // Should be whole pipeline compilation @@ -2656,7 +2727,7 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { // corresponding input location in the next stage. For example, if TCS output has dynamic location indexing from // [0,2], we need add the corresponding location info to TES input map. Otherwise, it will cause mismatch when the // dynamic indexing is in a loop and TES only uses location 1. 
- auto preStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + auto preStage = m_pipelineState->getPrevShaderStage(m_shaderStage.value()); if (preStage == ShaderStage::TessControl || preStage == ShaderStage::Mesh) { if (!inputLocInfoMap.empty()) { auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(preStage.value())->inOutUsage.outputLocInfoMap; @@ -2734,8 +2805,8 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { // ===================================================================================================================== // Clear unused output from outputLocInfoMap, perPatchOutputLocMap, and perPrimitiveOutputLocMap void PatchResourceCollect::clearUnusedOutput() { - auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &outputLocInfoMap = inOutUsage.outputLocInfoMap; if (nextStage) { // Collect the locations of TCS's imported outputs @@ -2878,8 +2949,8 @@ void PatchResourceCollect::clearUnusedOutput() { void PatchResourceCollect::updateOutputLocInfoMapWithUnpack() { clearUnusedOutput(); - const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; // // Update per-vertex output location info @@ -3133,7 +3204,7 @@ bool PatchResourceCollect::canChangeOutputLocationsForGs() { // ===================================================================================================================== // Update inputLocInfoMap based on {TCS, GS, FS} input import 
calls void PatchResourceCollect::updateInputLocInfoMapWithPack() { - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &inputLocInfoMap = inOutUsage.inputLocInfoMap; inputLocInfoMap.clear(); @@ -3153,7 +3224,7 @@ void PatchResourceCollect::updateInputLocInfoMapWithPack() { bool isFsAndHasGs = (isFs && (m_pipelineState->hasShaderStage(ShaderStage::Geometry) || partPipelineHasGs)); bool requireDword = isTcs || isGs || isFsAndHasGs; // Create locationMap - m_locationInfoMapManager->createMap(m_inputCalls, m_shaderStage, requireDword); + m_locationInfoMapManager->createMap(m_inputCalls, m_shaderStage.value(), requireDword); // Fill inputLocInfoMap of {TCS, GS, FS} for the packable calls unsigned newLocIdx = 0; @@ -3176,112 +3247,128 @@ void PatchResourceCollect::updateInputLocInfoMapWithPack() { // ===================================================================================================================== // Update outputLocInfoMap based on inputLocInfoMap of next stage or GS output export calls for copy shader void PatchResourceCollect::updateOutputLocInfoMapWithPack() { - auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; + auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; auto &outputLocInfoMap = inOutUsage.outputLocInfoMap; - outputLocInfoMap.clear(); + outputLocInfoMap.clear(); // Clear it, will reconstruct if (m_outputCalls.empty()) return; assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || - m_shaderStage == ShaderStage::Geometry); - auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); + m_shaderStage == ShaderStage::Geometry); // Possible stages + auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); auto &nextStageInputLocInfoMap = 
m_pipelineState->getShaderResourceUsage(nextStage.value())->inOutUsage.inputLocInfoMap; // Remove unused outputs and update the output map if (m_shaderStage != m_pipelineState->getLastVertexProcessingStage()) { - // For VS-{TCS, GS}, the dead output has no matching input of the next stage + // Not last vertex processing stage, collect dead outputs that have no matching inputs of the next stage. + + // Collect dead output calls. for (auto call : m_outputCalls) { InOutLocationInfo origLocInfo; origLocInfo.setLocation(cast(call->getOperand(0))->getZExtValue()); origLocInfo.setComponent(cast(call->getOperand(1))->getZExtValue()); - if (nextStageInputLocInfoMap.find(origLocInfo) == nextStageInputLocInfoMap.end()) + if (nextStageInputLocInfoMap.count(origLocInfo) == 0) m_deadCalls.push_back(call); } - // The output map should be equal to the input map of the next stage + + // Use the input map of the next stage to update the output map of current stage. outputLocInfoMap = nextStageInputLocInfoMap; } else { - // For {VS, TES, GS}-FS, the dead output is neither a XFB output or a corresponding FS' input. + // Last vertex processing stage, collect dead outputs that are not XFB output or have no matching FS inputs. assert(nextStage == ShaderStage::Fragment); - // Collect XFB locations - auto &xfbOutLocInfoMap = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage.locInfoXfbOutInfoMap; - std::set xfbOutputLocs[MaxGsStreams]; - for (const auto &locInfoPair : xfbOutLocInfoMap) { - const auto &locInfo = locInfoPair.first; - xfbOutputLocs[locInfo.getStreamId()].insert(locInfo.getLocation()); + const unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + + // Collect XFB output location pair . 
+ SmallSet, MaxInOutLocCount> xfbOutputLocPairs[MaxGsStreams]; + auto &xfbOutInfoMap = + m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage.locInfoXfbOutInfoMap; + for (const auto &xfbOutLocInfo : xfbOutInfoMap) { + const auto &locInfo = xfbOutLocInfo.first; + if (locInfo.isBuiltIn()) + continue; // Skip built-in outputs + xfbOutputLocPairs[locInfo.getStreamId()].insert(std::make_pair(locInfo.getLocation(), locInfo.getComponent())); } - // Store the output calls that have no corresponding input in FS + // Collect the output calls that have no corresponding FS inputs. std::vector noMappedCalls; for (auto call : m_outputCalls) { - // NOTE: Don't set stream ID to the original output location info for GS. This is because the corresponding input - // location info of FS doesn't have stream ID. This will cause in-out mismatch. - InOutLocationInfo origLocInfo; - origLocInfo.setLocation(cast(call->getOperand(0))->getZExtValue()); - origLocInfo.setComponent(cast(call->getOperand(1))->getZExtValue()); - - const unsigned origLocation = origLocInfo.getLocation(); - const bool hasNoMappedInput = (nextStageInputLocInfoMap.find(origLocInfo) == nextStageInputLocInfoMap.end()); - if (hasNoMappedInput) { - const unsigned streamId = - m_shaderStage == ShaderStage::Geometry ? cast(call->getOperand(2))->getZExtValue() : 0; - - if (xfbOutputLocs[streamId].count(origLocation) == 0) + const unsigned location = cast(call->getOperand(0))->getZExtValue(); + const unsigned component = cast(call->getOperand(1))->getZExtValue(); + const unsigned streamId = + m_shaderStage == ShaderStage::Geometry ? cast(call->getOperand(2))->getZExtValue() : 0; + + bool noMappedInput = true; + if (streamId == rasterStream) { + // Skip checking FS inputs if this output doesn't belong to rasterization stream. 
+ InOutLocationInfo origLocInfo; + origLocInfo.setLocation(location); + origLocInfo.setComponent(component); + // NOTE: Don't set stream ID to the original output location info for GS. This is because the corresponding + // input location info of FS doesn't have stream ID. This will cause in-out mismatch. + noMappedInput = nextStageInputLocInfoMap.count(origLocInfo) == 0; + } + + if (noMappedInput) { + if (xfbOutputLocPairs[streamId].count(std::make_pair(location, component)) == 0) m_deadCalls.push_back(call); else noMappedCalls.push_back(call); } } - // The output map of current stage contains at most two parts: the first part is consistent with FS input map and - // the second part is built from the no mapped calls. - std::vector outLocInfos; + + // The output map of current stage consists of two parts: the first part is consistent with FS input map and + // the second part is from the no mapped calls. + std::vector noMappedOutputLocInfos; for (auto call : noMappedCalls) { InOutLocationInfo origLocInfo; origLocInfo.setLocation(cast(call->getOperand(0))->getZExtValue()); origLocInfo.setComponent(cast(call->getOperand(1))->getZExtValue()); if (m_shaderStage == ShaderStage::Geometry) origLocInfo.setStreamId(cast(call->getOperand(2))->getZExtValue()); - outLocInfos.push_back(origLocInfo); + noMappedOutputLocInfos.push_back(origLocInfo); } - m_locationInfoMapManager->createMap(outLocInfos, m_shaderStage); - const auto &calcOutLocInfoMap = m_locationInfoMapManager->getMap(); + m_locationInfoMapManager->createMap(noMappedOutputLocInfos, m_shaderStage.value()); + const auto &noMappedOutputLocInfoMap = m_locationInfoMapManager->getMap(); + // Reconstruct the first part of output map by using FS input map. if (m_shaderStage == ShaderStage::Geometry) { - // NOTE: The output location info from next shader stage (FS) doesn't contain raster stream ID. We have to - // reconstruct it. 
- const auto rasterStream = m_pipelineState->getRasterizerState().rasterStream; - for (auto &entry : nextStageInputLocInfoMap) { - InOutLocationInfo origLocInfo(entry.first); - origLocInfo.setStreamId(rasterStream); - InOutLocationInfo newLocInfo(entry.second); - newLocInfo.setStreamId(rasterStream); - outputLocInfoMap.insert({origLocInfo, newLocInfo}); + if (rasterStream != InvalidValue) { + // NOTE: The output location info from FS doesn't contain rasterization stream ID. We have to reconstruct it. + for (auto &locInfo : nextStageInputLocInfoMap) { + InOutLocationInfo origLocInfo(locInfo.first); + origLocInfo.setStreamId(rasterStream); + InOutLocationInfo newLocInfo(locInfo.second); + newLocInfo.setStreamId(rasterStream); + outputLocInfoMap.insert({origLocInfo, newLocInfo}); + } } } else { outputLocInfoMap = nextStageInputLocInfoMap; } - unsigned newLocMax = 0; - for (const auto &entry : outputLocInfoMap) - newLocMax = std::max(newLocMax, entry.second.getLocation() + 1); - // Update output map - for (const auto &entry : calcOutLocInfoMap) { - InOutLocationInfo origLocInfo; - origLocInfo.setStreamId(entry.first.getStreamId()); - origLocInfo.setLocation(entry.first.getLocation()); - origLocInfo.setComponent(entry.first.getComponent()); - InOutLocationInfo newLocInfo(entry.second); - newLocInfo.setLocation(newLocInfo.getLocation() + newLocMax); + // Reconstruct the second part of output map by visiting each call of XFB output. + unsigned maxMappedLoc[MaxGsStreams] = {}; + for (const auto &locInfo : outputLocInfoMap) { + maxMappedLoc[locInfo.first.getStreamId()] = + std::max(maxMappedLoc[locInfo.first.getStreamId()], locInfo.second.getLocation() + 1); + } + + // Update output map for those XFB outputs. 
+ for (const auto &locInfo : noMappedOutputLocInfoMap) { + InOutLocationInfo origLocInfo(locInfo.first); + InOutLocationInfo newLocInfo(locInfo.second); + newLocInfo.setLocation(newLocInfo.getLocation() + maxMappedLoc[locInfo.first.getStreamId()]); outputLocInfoMap.insert({origLocInfo, newLocInfo}); } - // update output count per stream for GS + // Update output count per stream for GS if (m_shaderStage == ShaderStage::Geometry) { - for (auto &locInfoPair : outputLocInfoMap) { - auto &outLocCount = inOutUsage.gs.outLocCount[locInfoPair.first.getStreamId()]; - outLocCount = std::max(outLocCount, locInfoPair.second.getLocation() + 1); + for (auto &locInfo : outputLocInfoMap) { + auto &outLocCount = inOutUsage.gs.outLocCount[locInfo.first.getStreamId()]; + outLocCount = std::max(outLocCount, locInfo.second.getLocation() + 1); } } } @@ -3292,7 +3379,7 @@ void PatchResourceCollect::updateOutputLocInfoMapWithPack() { void PatchResourceCollect::reassembleOutputExportCalls() { if (m_outputCalls.empty()) return; - assert(m_pipelineState->canPackOutput(m_shaderStage)); + assert(m_pipelineState->canPackOutput(m_shaderStage.value())); BuilderBase builder(*m_context); builder.SetInsertPoint(m_outputCalls.back()); @@ -3317,7 +3404,7 @@ void PatchResourceCollect::reassembleOutputExportCalls() { }; // Collect ElementsInfo in each packed location - auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage.outputLocInfoMap; + auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage.outputLocInfoMap; std::vector elementsInfoArray(outputLocInfoMap.size()); for (auto call : m_outputCalls) { @@ -3746,7 +3833,7 @@ void PatchResourceCollect::clearUndefinedOutput() { for (auto call : candidateCalls) { // For unlinked case, we should keep the location info map unchanged. 
- if (m_pipelineState->getNextShaderStage(m_shaderStage)) { + if (m_pipelineState->getNextShaderStage(m_shaderStage.value())) { // Remove the output location info if it exists unsigned index = m_shaderStage == ShaderStage::Mesh ? 2 : 1; unsigned component = cast(call->getArgOperand(index))->getZExtValue(); diff --git a/lgc/patch/RegisterMetadataBuilder.cpp b/lgc/patch/RegisterMetadataBuilder.cpp index 476a641d93..c982836fc9 100644 --- a/lgc/patch/RegisterMetadataBuilder.cpp +++ b/lgc/patch/RegisterMetadataBuilder.cpp @@ -684,9 +684,16 @@ void RegisterMetadataBuilder::buildHwVsRegisters() { vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::Streamout_1En] = enablePrimStats || streamXfbBuffers[1] > 0; vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::Streamout_2En] = enablePrimStats || streamXfbBuffers[2] > 0; vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::Streamout_3En] = enablePrimStats || streamXfbBuffers[3] > 0; - if (shaderStage == ShaderStage::CopyShader) - vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::RastStream] = - m_pipelineState->getRasterizerState().rasterStream; + if (shaderStage == ShaderStage::CopyShader) { + unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + if (m_pipelineState->getRasterizerState().rasterStream == InvalidValue) { + // NOTE: According to HW register spec, rasterization stream has 3 bits, the lower 2 bits are programmed to stream + // ID (0~3). If rasterization is not enabled for any stream, set the highest 1 bit to 1. 
+ static const unsigned NoRasterStream = 0x4; + rasterStream = NoRasterStream; + } + vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::RastStream] = rasterStream; + } // Set some field of SPI_SHADER_PGM_RSRC2_VS getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VsStreamoutEn] = enableXfb; diff --git a/lgc/patch/PatchLoadScalarizer.cpp b/lgc/patch/ScalarizeLoads.cpp similarity index 98% rename from lgc/patch/PatchLoadScalarizer.cpp rename to lgc/patch/ScalarizeLoads.cpp index a551bff048..964e0a9636 100644 --- a/lgc/patch/PatchLoadScalarizer.cpp +++ b/lgc/patch/ScalarizeLoads.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchLoadScalarizer.cpp + * @file ScalarizeLoads.cpp * @brief LLPC source file: contains implementation of class lgc::PatchLoadScalarizer. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchLoadScalarizer.h" +#include "lgc/patch/ScalarizeLoads.h" #include "lgc/state/PipelineShaders.h" #include "lgc/state/PipelineState.h" #include "llvm/IR/Constants.h" diff --git a/lgc/patch/ShaderInputs.cpp b/lgc/patch/ShaderInputs.cpp index 0c6a1253a3..fd8b6c4a02 100644 --- a/lgc/patch/ShaderInputs.cpp +++ b/lgc/patch/ShaderInputs.cpp @@ -308,7 +308,7 @@ const char *ShaderInputs::getInputName(ShaderInput inputKind) { } // ===================================================================================================================== -// Gather usage of shader inputs from before PatchEntryPointMutate +// Gather usage of shader inputs from before MutateEntryPoint // // @param module : IR module void ShaderInputs::gatherUsage(Module &module) { @@ -397,7 +397,7 @@ void ShaderInputs::fixupUses(Module &module, PipelineState *pipelineState, bool // The new ShaderInputs scheme means that InOutBuilder or PatchResourceCollect no longer needs to set // the builtInUsage field for an input that is generated using ShaderInputs::getInput() and/or - // ShaderInputs::getSpecialUserData() (before PatchEntryPointMutate), and we can remove that + // ShaderInputs::getSpecialUserData() (before MutateEntryPoint), and we can remove that // builtInUsage field. 
// // However, in some cases, the builtInUsage field is used in NggPrimShader and/or Gfx*ConfigBuilder @@ -735,7 +735,6 @@ uint64_t ShaderInputs::getShaderArgTys(PipelineState *pipelineState, ShaderStage // // @param stage : Shader stage ShaderInputs::ShaderInputsUsage *ShaderInputs::getShaderInputsUsage(ShaderStageEnum stage) { - m_shaderInputsUsage.resize(std::max(m_shaderInputsUsage.size(), static_cast(stage) + 1)); return &m_shaderInputsUsage[stage]; } diff --git a/lgc/state/PassManagerCache.cpp b/lgc/state/PassManagerCache.cpp index 32f8f8425c..25a6325651 100644 --- a/lgc/state/PassManagerCache.cpp +++ b/lgc/state/PassManagerCache.cpp @@ -30,7 +30,7 @@ */ #include "lgc/state/PassManagerCache.h" #include "lgc/LgcContext.h" -#include "lgc/patch/PatchLlvmIrInclusion.h" +#include "lgc/patch/IncludeLlvmIr.h" #include "lgc/patch/PatchSetupTargetFeatures.h" #include "llvm/Analysis/TargetTransformInfo.h" #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 442438 diff --git a/lgc/state/PipelineShaders.cpp b/lgc/state/PipelineShaders.cpp index c14b41e61e..6ae6f579c3 100644 --- a/lgc/state/PipelineShaders.cpp +++ b/lgc/state/PipelineShaders.cpp @@ -44,8 +44,6 @@ AnalysisKey PipelineShaders::Key; // ===================================================================================================================== PipelineShadersResult::PipelineShadersResult() { - for (auto &entryPoint : m_entryPoints) - entryPoint = nullptr; } // ===================================================================================================================== @@ -77,8 +75,7 @@ PipelineShadersResult PipelineShaders::run(Module &module, ModuleAnalysisManager // // @param shaderStage : Shader stage Function *PipelineShadersResult::getEntryPoint(ShaderStageEnum shaderStage) const { - assert((unsigned)shaderStage < ShaderStage::CountInternal); - return m_entryPoints[shaderStage]; + return m_entryPoints.lookup(shaderStage); } // 
===================================================================================================================== diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp index b7a85906ee..947559e43e 100644 --- a/lgc/state/PipelineState.cpp +++ b/lgc/state/PipelineState.cpp @@ -589,9 +589,7 @@ ShaderStageMask PipelineState::getShaderStageMask() { // ===================================================================================================================== // Check whether the pipeline is a graphics pipeline bool PipelineState::isGraphics() { - return getShaderStageMask().contains_any({ShaderStage::Task, ShaderStage::Vertex, ShaderStage::TessControl, - ShaderStage::TessEval, ShaderStage::Geometry, ShaderStage::Mesh, - ShaderStage::Fragment}); + return getShaderStageMask().contains_any(ShaderStagesGraphics); } // ===================================================================================================================== @@ -600,8 +598,6 @@ bool PipelineState::isGraphics() { // @param stage : Shader stage // @param options : Shader options void PipelineState::setShaderOptions(ShaderStageEnum stage, const ShaderOptions &options) { - if (m_shaderOptions.size() <= stage) - m_shaderOptions.resize(stage + 1); m_shaderOptions[stage] = options; } @@ -610,8 +606,6 @@ void PipelineState::setShaderOptions(ShaderStageEnum stage, const ShaderOptions // // @param stage : Shader stage const ShaderOptions &PipelineState::getShaderOptions(ShaderStageEnum stage) { - if (m_shaderOptions.size() <= stage) - m_shaderOptions.resize(stage + 1); return m_shaderOptions[stage]; } @@ -632,9 +626,12 @@ void PipelineState::recordOptions(Module *module) { if (unsigned preRasterHasGs = unsigned(m_preRasterHasGs)) setNamedMetadataToArrayOfInt32(module, preRasterHasGs, PreRasterHasGsMetadataName); setNamedMetadataToArrayOfInt32(module, m_options, OptionsMetadataName); - for (unsigned stage = 0; stage != m_shaderOptions.size(); ++stage) { - std::string metadataName 
= - (Twine(OptionsMetadataName) + "." + getShaderStageAbbreviation(static_cast(stage))).str(); + // Iterate stages in deterministic order + for (auto stage : ShaderStagesNative) { + if (!m_shaderOptions.contains(stage)) + continue; + + std::string metadataName = (Twine(OptionsMetadataName) + "." + getShaderStageAbbreviation(stage)).str(); setNamedMetadataToArrayOfInt32(module, m_shaderOptions[stage], metadataName); } } @@ -664,13 +661,12 @@ void PipelineState::readOptions(Module *module) { m_preRasterHasGs = preRasterHasGsAsInt; readNamedMetadataArrayOfInt32(module, OptionsMetadataName, m_options); - for (unsigned stage = 0; stage != ShaderStage::Compute + 1; ++stage) { + for (auto stage : ShaderStagesNative) { std::string metadataName = (Twine(OptionsMetadataName) + "." + getShaderStageAbbreviation(static_cast(stage))).str(); auto namedMetaNode = module->getNamedMetadata(metadataName); if (!namedMetaNode || namedMetaNode->getNumOperands() == 0) continue; - m_shaderOptions.resize(stage + 1); readArrayOfInt32MetaNode(namedMetaNode->getOperand(0), m_shaderOptions[stage]); } } @@ -769,7 +765,7 @@ void PipelineState::recordUserDataTable(ArrayRef nodes, NamedMDNod // Operand 1: matchType operands.push_back(ConstantAsMetadata::get(builder.getInt32(static_cast(node.abstractType)))); // Operand 2: visibility - operands.push_back(ConstantAsMetadata::get(builder.getInt32(node.visibility))); + operands.push_back(ConstantAsMetadata::get(builder.getInt32(node.visibility.toRaw()))); // Operand 3: offsetInDwords operands.push_back(ConstantAsMetadata::get(builder.getInt32(node.offsetInDwords))); // Operand 4: sizeInDwords @@ -840,7 +836,8 @@ void PipelineState::readUserDataNodes(Module *module) { nextNode->abstractType = static_cast(mdconst::extract(metadataNode->getOperand(1))->getZExtValue()); // Operand 2: visibility - nextNode->visibility = mdconst::extract(metadataNode->getOperand(2))->getZExtValue(); + nextNode->visibility = + 
ShaderStageMask::fromRaw(mdconst::extract(metadataNode->getOperand(2))->getZExtValue()); // Operand 3: offsetInDwords nextNode->offsetInDwords = mdconst::extract(metadataNode->getOperand(3))->getZExtValue(); // Operand 4: sizeInDwords @@ -900,18 +897,22 @@ void PipelineState::readUserDataNodes(Module *module) { // // @param stage : Shader stage to check against nodes' visibility field, or ShaderStage::Invalid for any const ResourceNode *PipelineState::findPushConstantResourceNode(std::optional stage) const { - unsigned visibilityMask = UINT_MAX; - if (stage) - visibilityMask = 1 << std::min(unsigned(stage.value()), unsigned(ShaderStage::Compute)); + ShaderStageMask visibilityMask(ShaderStages); + if (stage) { + ShaderStageEnum maskStage = stage.value(); + if (!ShaderStageMask(ShaderStagesNative).contains(maskStage)) + maskStage = ShaderStage::Compute; + visibilityMask = ShaderStageMask(maskStage); + } for (const ResourceNode &node : getUserDataNodes()) { - if (node.visibility != 0 && (node.visibility & visibilityMask) == 0) + if (!node.visibility.empty() && (node.visibility & visibilityMask).empty()) continue; if (node.concreteType == ResourceNodeType::PushConst) return &node; if (node.concreteType == ResourceNodeType::DescriptorTableVaPtr) { if (!node.innerTable.empty() && node.innerTable[0].concreteType == ResourceNodeType::PushConst) { - if (node.innerTable[0].visibility != 0 && (node.innerTable[0].visibility & visibilityMask) == 0) + if (!node.innerTable[0].visibility.empty() && (node.innerTable[0].visibility & visibilityMask).empty()) continue; assert(ResourceLayoutScheme::Indirect == m_options.resourceLayoutScheme); return &node; @@ -921,50 +922,6 @@ const ResourceNode *PipelineState::findPushConstantResourceNode(std::optional Unknown <--------------+--------------------+ -// -// @param nodeType : Resource node type -// @param candidateType : Resource node candidate type -static bool isNodeTypeCompatible(ResourceNodeType nodeType, ResourceNodeType 
candidateType) { - if (nodeType == ResourceNodeType::Unknown || candidateType == nodeType || - candidateType == ResourceNodeType::DescriptorMutable) - return true; - - if ((nodeType == ResourceNodeType::DescriptorConstBuffer || nodeType == DescriptorAnyBuffer) && - (candidateType == ResourceNodeType::DescriptorConstBufferCompact || - candidateType == ResourceNodeType::DescriptorConstBuffer || candidateType == ResourceNodeType::InlineBuffer)) - return true; - - if ((nodeType == ResourceNodeType::DescriptorBuffer || nodeType == DescriptorAnyBuffer) && - (candidateType == ResourceNodeType::DescriptorBufferCompact || - candidateType == ResourceNodeType::DescriptorBuffer)) - return true; - - if ((nodeType == ResourceNodeType::DescriptorResource || nodeType == ResourceNodeType::DescriptorTexelBuffer || - nodeType == ResourceNodeType::DescriptorSampler) && - candidateType == ResourceNodeType::DescriptorCombinedTexture) - return true; - - return false; -} - // ===================================================================================================================== // Returns true when type is one that has a binding. // @param nodeType : Resource node type @@ -998,12 +955,10 @@ static bool nodeTypeHasBinding(ResourceNodeType nodeType) { // sizeInDwords/stride. 
// // @param node : Node to try and match -// @param nodeType : Resource node type being searched for // @param descSet : Descriptor set being searched for // @param binding : Descriptor binding being searched for -bool PipelineState::matchResourceNode(const ResourceNode &node, ResourceNodeType nodeType, uint64_t descSet, - unsigned binding) const { - if (node.set != descSet || !isNodeTypeCompatible(nodeType, node.abstractType)) +bool PipelineState::matchResourceNode(const ResourceNode &node, uint64_t descSet, unsigned binding) const { + if (node.set != descSet) return false; if (node.binding == binding) return true; @@ -1032,14 +987,18 @@ bool PipelineState::matchResourceNode(const ResourceNode &node, ResourceNodeType std::pair PipelineState::findResourceNode(ResourceNodeType nodeType, uint64_t descSet, unsigned binding, std::optional stage) const { - unsigned visibilityMask = UINT_MAX; - if (stage) - visibilityMask = 1 << std::min(unsigned(stage.value()), unsigned(ShaderStage::Compute)); + ShaderStageMask visibilityMask(ShaderStages); + if (stage) { + ShaderStageEnum maskStage = stage.value(); + if (!ShaderStageMask(ShaderStagesNative).contains(maskStage)) + maskStage = ShaderStage::Compute; + visibilityMask = ShaderStageMask(maskStage); + } for (const ResourceNode &node : getUserDataNodes()) { if (!nodeTypeHasBinding(node.concreteType)) continue; - if (node.visibility != 0 && (node.visibility & visibilityMask) == 0) + if (!node.visibility.empty() && (node.visibility & visibilityMask).empty()) continue; if (node.concreteType == ResourceNodeType::DescriptorTableVaPtr) { @@ -1053,12 +1012,12 @@ PipelineState::findResourceNode(ResourceNodeType nodeType, uint64_t descSet, uns // Check inner nodes. 
for (const ResourceNode &innerNode : node.innerTable) { - if (innerNode.visibility != 0 && (innerNode.visibility & visibilityMask) == 0) + if (!innerNode.visibility.empty() && (innerNode.visibility & visibilityMask).empty()) continue; - if (matchResourceNode(innerNode, nodeType, descSet, binding)) + if (matchResourceNode(innerNode, descSet, binding)) return {&node, &innerNode}; } - } else if (matchResourceNode(node, nodeType, descSet, binding)) + } else if (matchResourceNode(node, descSet, binding)) return {&node, &node}; } @@ -1081,13 +1040,18 @@ PipelineState::findResourceNode(ResourceNodeType nodeType, uint64_t descSet, uns // // @param nodeType : Type of the resource mapping node // @param stage : Shader stage to check against nodes' visibility field, or ShaderStage::Invalid for any -const ResourceNode *PipelineState::findSingleRootResourceNode(ResourceNodeType nodeType, ShaderStageEnum stage) const { - unsigned visibilityMask = UINT_MAX; - if (stage != ShaderStage::Invalid) - visibilityMask = 1 << std::min(unsigned(stage), unsigned(ShaderStage::Compute)); +const ResourceNode *PipelineState::findSingleRootResourceNode(ResourceNodeType nodeType, + std::optional stage) const { + ShaderStageMask visibilityMask(ShaderStages); + if (stage) { + ShaderStageEnum maskStage = stage.value(); + if (!ShaderStageMask(ShaderStagesNative).contains(maskStage)) + maskStage = ShaderStage::Compute; + visibilityMask = ShaderStageMask(maskStage); + } for (const ResourceNode &node : getUserDataNodes()) { - if (node.visibility != 0 && (node.visibility & visibilityMask) == 0) + if (!node.visibility.empty() && (node.visibility & visibilityMask).empty()) continue; if (node.concreteType == nodeType) return &node; @@ -1356,7 +1320,7 @@ unsigned PipelineState::getShaderWaveSize(ShaderStageEnum stage) { stage = ShaderStage::Geometry; } - assert(stage <= ShaderStage::Compute); + assert(ShaderStageMask(ShaderStagesNative).contains(stage)); if (!m_waveSize[stage]) 
setShaderDefaultWaveSize(stage); @@ -1655,10 +1619,9 @@ bool PipelineState::getShaderWgpMode(ShaderStageEnum stage) const { stage = ShaderStage::Geometry; } - assert(stage <= ShaderStage::Compute); - assert(stage < m_shaderOptions.size()); + assert(ShaderStageMask(ShaderStagesNative).contains(stage)); - return m_shaderOptions[stage].wgpMode; + return m_shaderOptions.lookup(stage).wgpMode; } // ===================================================================================================================== @@ -1704,7 +1667,7 @@ bool PipelineState::enableSwXfb() { lastVertexStage = lastVertexStage == ShaderStage::CopyShader ? ShaderStage::Geometry : lastVertexStage; if (!lastVertexStage) { - assert(isUnlinked()); // Unlinked fragment shader or part-pipeline + assert(!isWholePipeline()); // Unlinked fragment shader or part-pipeline return false; } @@ -1735,7 +1698,7 @@ ResourceUsage *PipelineState::getShaderResourceUsage(ShaderStageEnum shaderStage if (shaderStage == ShaderStage::CopyShader) shaderStage = ShaderStage::Geometry; - auto &resUsage = MutableArrayRef>(m_resourceUsage)[shaderStage]; + auto &resUsage = m_resourceUsage[shaderStage]; if (!resUsage) { resUsage = std::make_unique(shaderStage); } @@ -1750,7 +1713,7 @@ InterfaceData *PipelineState::getShaderInterfaceData(ShaderStageEnum shaderStage if (shaderStage == ShaderStage::CopyShader) shaderStage = ShaderStage::Geometry; - auto &intfData = MutableArrayRef>(m_interfaceData)[shaderStage]; + auto &intfData = m_interfaceData[shaderStage]; if (!intfData) { intfData = std::make_unique(); } diff --git a/lgc/state/RayTracingLibrarySummary.cpp b/lgc/state/RayTracingLibrarySummary.cpp index 9ac4fa0eab..2a6c45aa9a 100644 --- a/lgc/state/RayTracingLibrarySummary.cpp +++ b/lgc/state/RayTracingLibrarySummary.cpp @@ -38,7 +38,7 @@ using namespace lgc; namespace { namespace RtLibSummary { -constexpr unsigned MajorVersion = 1; +constexpr unsigned MajorVersion = 2; static constexpr char Version[] = "version"; static 
constexpr char UsesTraceRay[] = "uses_trace_ray"; @@ -46,9 +46,9 @@ static constexpr char KnownSetRayFlags[] = "ray_flags_known_set"; static constexpr char KnownUnsetRayFlags[] = "ray_flags_known_unset"; static constexpr char MaxRayPayloadSize[] = "max_ray_payload_size"; static constexpr char MaxHitAttributeSize[] = "max_hit_attribute_size"; -static constexpr char MaxUsedPayloadRegisterCount[] = "max_used_payload_register_count"; static constexpr char HasKernelEntry[] = "has_kernel_entry"; static constexpr char HasTraceRayModule[] = "has_trace_ray_module"; +static constexpr char LlvmRaytracingState[] = "llvm_raytracing_state"; } // namespace RtLibSummary } // anonymous namespace @@ -81,9 +81,12 @@ Expected RayTracingLibrarySummary::decodeMsgpack(Strin getUInt(root[RtLibSummary::KnownUnsetRayFlags], rls.knownUnsetRayFlags); getUInt(root[RtLibSummary::MaxRayPayloadSize], rls.maxRayPayloadSize); getUInt(root[RtLibSummary::MaxHitAttributeSize], rls.maxHitAttributeSize); - getUInt(root[RtLibSummary::MaxUsedPayloadRegisterCount], rls.maxUsedPayloadRegisterCount); getBool(root[RtLibSummary::HasKernelEntry], rls.hasKernelEntry); getBool(root[RtLibSummary::HasTraceRayModule], rls.hasTraceRayModule); + auto errorOrState = llvmraytracing::PipelineState::decodeMsgpack(root[RtLibSummary::LlvmRaytracingState]); + if (auto error = errorOrState.takeError()) + return error; + rls.llvmRaytracingState = *errorOrState; return rls; } @@ -100,9 +103,9 @@ std::string RayTracingLibrarySummary::encodeMsgpack() const { root[RtLibSummary::KnownUnsetRayFlags] = knownUnsetRayFlags; root[RtLibSummary::MaxRayPayloadSize] = maxRayPayloadSize; root[RtLibSummary::MaxHitAttributeSize] = maxHitAttributeSize; - root[RtLibSummary::MaxUsedPayloadRegisterCount] = maxUsedPayloadRegisterCount; root[RtLibSummary::HasKernelEntry] = hasKernelEntry; root[RtLibSummary::HasTraceRayModule] = hasTraceRayModule; + llvmRaytracingState.encodeMsgpack(root[RtLibSummary::LlvmRaytracingState]); std::string out; 
doc.writeToBlob(out); @@ -117,9 +120,9 @@ void RayTracingLibrarySummary::merge(const RayTracingLibrarySummary &other) { } maxRayPayloadSize = std::max(maxRayPayloadSize, other.maxRayPayloadSize); maxHitAttributeSize = std::max(maxHitAttributeSize, other.maxHitAttributeSize); - maxUsedPayloadRegisterCount = std::max(maxUsedPayloadRegisterCount, other.maxUsedPayloadRegisterCount); // TODO: Inherit kernel entry and trace ray module if possible and avoid recompile? hasKernelEntry = false; hasTraceRayModule = false; + llvmRaytracingState.merge(other.llvmRaytracingState); } diff --git a/lgc/state/ShaderModes.cpp b/lgc/state/ShaderModes.cpp index aec7d7699c..fa8d4cffba 100644 --- a/lgc/state/ShaderModes.cpp +++ b/lgc/state/ShaderModes.cpp @@ -51,7 +51,7 @@ static const char ComputeShaderModeMetadataName[] = "llpc.compute.mode"; // ===================================================================================================================== // Clear shader modes void ShaderModes::clear() { - memset(m_commonShaderModes, 0, sizeof(m_commonShaderModes)); + m_commonShaderModes.clear(); } // ===================================================================================================================== @@ -82,14 +82,16 @@ CommonShaderMode ShaderModes::getCommonShaderMode(Module &module, ShaderStageEnu // // @param stage : Shader stage const CommonShaderMode &ShaderModes::getCommonShaderMode(ShaderStageEnum stage) const { - return ArrayRef(m_commonShaderModes)[stage]; + auto mode = m_commonShaderModes.find(stage); + assert(mode != m_commonShaderModes.end()); + return mode->second; } // ===================================================================================================================== // Check if any shader stage has useSubgroupSize set bool ShaderModes::getAnyUseSubgroupSize() const { for (const auto &commonShaderMode : m_commonShaderModes) { - if (commonShaderMode.useSubgroupSize) + if (commonShaderMode.second.useSubgroupSize) return true; } return 
false; @@ -222,8 +224,8 @@ void ShaderModes::setSubgroupSizeUsage(Module &module, ShaderStageEnum stage, bo // @param module : LLVM module void ShaderModes::readModesFromPipeline(Module *module) { // First the common state. - for (unsigned stage = 0; stage < ArrayRef(m_commonShaderModes).size(); ++stage) - m_commonShaderModes[stage] = getCommonShaderMode(*module, ShaderStageEnum(stage)); + for (auto stage : ShaderStagesNative) + m_commonShaderModes[stage] = getCommonShaderMode(*module, stage); // Then the specific shader modes except tessellation. PipelineState::readNamedMetadataArrayOfInt32(module, GeometryShaderModeMetadataName, m_geometryShaderMode); diff --git a/lgc/state/TargetInfo.cpp b/lgc/state/TargetInfo.cpp index 73f0a1a59e..0b0aae94ce 100644 --- a/lgc/state/TargetInfo.cpp +++ b/lgc/state/TargetInfo.cpp @@ -35,10 +35,38 @@ using namespace lgc; using namespace llvm; +namespace llvm { +namespace cl { +// Define a category for Helper options. +OptionCategory AmdCategory{"Helper Options"}; +} // namespace cl +} // namespace llvm + // -native-wave-size: an option to override hardware native wave size, it will allow compiler to choose // final wave size base on it. Used in pre-silicon verification. 
static cl::opt NativeWaveSize("native-wave-size", cl::desc("Overrides hardware native wave size"), cl::init(0)); +namespace { + +class TargetInfoPrinter { +public: + void print(); + + void operator=(bool value) { + if (!value) + return; + print(); + exit(0); + } +}; + +TargetInfoPrinter TargetInfoPrinterInstance; + +cl::opt> TargetPrinter{ + "targetInfo", cl::desc("Display the supported device infos."), cl::location(TargetInfoPrinterInstance), + cl::cat(cl::AmdCategory)}; +} // namespace + // ===================================================================================================================== // Functions to set up TargetInfo for the various targets @@ -349,46 +377,49 @@ static void setGfx115FInfo(TargetInfo *targetInfo) { } #endif -// ===================================================================================================================== -// Set TargetInfo. Returns false if the GPU name is not found or not supported. -// -// @param gpuName : LLVM GPU name, e.g. "gfx900" -bool TargetInfo::setTargetInfo(StringRef gpuName) { - struct GpuNameStringMap { - const char *gpuName; - void (*setTargetInfoFunc)(TargetInfo *targetInfo); - }; +// Represents device infos. 
+struct GpuNameStringMap { + const char *gpuName; + const char *deviceName; + void (*setTargetInfoFunc)(TargetInfo *targetInfo); +}; - static const GpuNameStringMap GpuNameMap[] = { - {"gfx1010", &setGfx1010Info}, // gfx1010 +// The supported device list +static const GpuNameStringMap GpuNameMap[] = { + {"gfx1010", "Navi10", &setGfx1010Info}, // gfx1010 #if LLPC_BUILD_NAVI12 - {"gfx1011", &setGfx1011Info}, // gfx1011, navi12 + {"gfx1011", "Navi12", &setGfx1011Info}, // gfx1011 #endif - {"gfx1012", &setGfx1012Info}, // gfx1012, navi14 - {"gfx1030", &setGfx1030Info}, // gfx1030, navi21 - {"gfx1031", &setGfx1031Info}, // gfx1031, navi22 - {"gfx1032", &setGfx1032Info}, // gfx1032, navi23 - {"gfx1034", &setGfx1034Info}, // gfx1034, navi24 + {"gfx1012", "Navi14", &setGfx1012Info}, // gfx1012 + {"gfx1030", "Navi21", &setGfx1030Info}, // gfx1030 + {"gfx1031", "Navi22", &setGfx1031Info}, // gfx1031 + {"gfx1032", "Navi23", &setGfx1032Info}, // gfx1032 + {"gfx1034", "Navi24", &setGfx1034Info}, // gfx1034 #if LLPC_BUILD_REMBRANDT - {"gfx1035", &setGfx1035Info}, // gfx1035, rembrandt + {"gfx1035", "Rembrandt", &setGfx1035Info}, // gfx1035 #endif #if LLPC_BUILD_RAPHAEL || LLPC_BUILD_MENDOCINO - {"gfx1036", &setGfx1036Info}, // gfx1036, raphael | mendocino + {"gfx1036", "Raphael", &setGfx1036Info}, // gfx1036 #endif - {"gfx1100", &setGfx1100Info}, // gfx1100, navi31 + {"gfx1100", "Navi31", &setGfx1100Info}, // gfx1100 #if LLPC_BUILD_NAVI32 - {"gfx1101", &setGfx1101Info}, // gfx1101, navi32 + {"gfx1101", "Navi32", &setGfx1101Info}, // gfx1101 #endif - {"gfx1102", &setGfx1102Info}, // gfx1102, navi33 + {"gfx1102", "Navi33", &setGfx1102Info}, // gfx1102 #if LLPC_BUILD_PHOENIX1 || LLPC_BUILD_PHOENIX2 - {"gfx1103", &setGfx1103Info}, // gfx1103, phoenix1 + {"gfx1103", "Phoenix1", &setGfx1103Info}, // gfx1103 #endif #if LLPC_BUILD_STRIX1 - {"gfx1150", &setGfx1150Info}, // gfx1150, strix - {"gfx115F", &setGfx115FInfo}, // gfx115F, strix A0 + {"gfx1150", "Strix1", &setGfx1150Info}, // 
gfx1150 + {"gfx115F", "Strix1 A0", &setGfx115FInfo}, // gfx115F #endif - }; +}; +// ===================================================================================================================== +// Set TargetInfo. Returns false if the GPU name is not found or not supported. +// +// @param gpuName : LLVM GPU name, e.g. "gfx900" +bool TargetInfo::setTargetInfo(StringRef gpuName) { void (*setTargetInfoFunc)(TargetInfo * targetInfo) = nullptr; for (const GpuNameStringMap &mapEntry : ArrayRef(GpuNameMap)) { if (gpuName == mapEntry.gpuName) { @@ -413,3 +444,13 @@ bool TargetInfo::setTargetInfo(StringRef gpuName) { return true; } + +// ===================================================================================================================== +// Print the target infos +void TargetInfoPrinter::print() { + unsigned count = sizeof(GpuNameMap) / sizeof(GpuNameMap[0]); + for (unsigned i = 0; i < count; ++i) { + // Remove substring "gfx" + outs() << StringRef(GpuNameMap[i].gpuName).drop_front(3) << " " << GpuNameMap[i].deviceName << '\n'; + } +} diff --git a/lgc/test/CallLibFromCs-indirect.lgc b/lgc/test/CallLibFromCs-indirect.lgc index 4c5f4a82a8..9f0d275e30 100644 --- a/lgc/test/CallLibFromCs-indirect.lgc +++ b/lgc/test/CallLibFromCs-indirect.lgc @@ -1,6 +1,6 @@ ; Call an extern compute library function from a compute shader. 
-; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-mutate-entry-point -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation ; CHECK: define dllexport amdgpu_cs void @lgc.shader.CS.main(i32 inreg noundef %globalTable, ptr addrspace(4) inreg noundef %numWorkgroupsPtr, i32 inreg noundef %userdata0, i32 inreg noundef %userdata1, i32 inreg noundef %userdata2, i32 inreg noundef %userdata3, i32 inreg noundef %userdata4, i32 inreg noundef %userdata5, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %userdata9, i32 inreg noundef %userdata10, i32 inreg noundef %userdata11, i32 inreg noundef %spillTable, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !lgc.shaderstage !7 { ; CHECK: call amdgpu_gfx i32 %func_ptr(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %userdata0, i32 inreg %userdata1, i32 inreg %userdata2, i32 inreg %userdata3, i32 inreg %userdata4, i32 inreg %userdata5, i32 inreg %userdata6, i32 inreg %userdata7, i32 inreg %userdata8, i32 inreg %userdata9, i32 inreg %userdata10, i32 inreg %userdata11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) diff --git a/lgc/test/CallLibFromCs.lgc b/lgc/test/CallLibFromCs.lgc index 3f68d40e8f..57f61b9b4f 100644 --- a/lgc/test/CallLibFromCs.lgc +++ b/lgc/test/CallLibFromCs.lgc @@ -1,6 +1,6 @@ ; Call an extern compute library function from a compute shader. 
-; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-mutate-entry-point -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation ; CHECK: declare amdgpu_gfx i32 @compute_library_func() #0 ; CHECK: define dllexport amdgpu_cs void @lgc.shader.CS.main(i32 inreg noundef %globalTable, ptr addrspace(4) inreg noundef %numWorkgroupsPtr, i32 inreg noundef %userdata0, i32 inreg noundef %userdata1, i32 inreg noundef %userdata2, i32 inreg noundef %userdata3, i32 inreg noundef %userdata4, i32 inreg noundef %userdata5, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %userdata9, i32 inreg noundef %userdata10, i32 inreg noundef %userdata11, i32 inreg noundef %spillTable, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #1 !lgc.shaderstage !7 { diff --git a/lgc/test/CsComputeLibrary.lgc b/lgc/test/CsComputeLibrary.lgc index 1368e78100..795fff8f49 100644 --- a/lgc/test/CsComputeLibrary.lgc +++ b/lgc/test/CsComputeLibrary.lgc @@ -1,6 +1,6 @@ ; Define a compute library that can be called from a compute shader. 
-; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -print-after=lgc-patch-prepare-pipeline-abi -print-after=lgc-patch-setup-target-features -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-mutate-entry-point -print-after=lgc-patch-prepare-pipeline-abi -print-after=lgc-patch-setup-target-features -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation ; CHECK: define amdgpu_gfx void @func(i32 inreg noundef %globalTable, ptr addrspace(4) inreg noundef %numWorkgroupsPtr, i32 inreg noundef %userdata0, i32 inreg noundef %userdata1, i32 inreg noundef %userdata2, i32 inreg noundef %userdata3, i32 inreg noundef %userdata4, i32 inreg noundef %userdata5, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %userdata9, i32 inreg noundef %userdata10, i32 inreg noundef %userdata11, i32 inreg noundef %spillTable, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !lgc.shaderstage !7 { ; CHECK: !7 = !{i32 7} diff --git a/lgc/test/TaskShaderOps.lgc b/lgc/test/TaskShaderOps.lgc index 88a85fee82..65b1862ae0 100644 --- a/lgc/test/TaskShaderOps.lgc +++ b/lgc/test/TaskShaderOps.lgc @@ -16,7 +16,7 @@ ; CHECK-NEXT: [[dimX:%[0-9]*]] = extractelement <3 x i32> %meshTaskDispatchDims, i64 0 ; CHECK-NEXT: [[tempResult2:%[0-9]*]] = mul i32 [[tempResult1]], [[dimX]] ; CHECK-NEXT: [[flattenId:%[0-9]*]] = add i32 [[tempResult2]], [[groupIdX]] -; CHECK-NEXT: [[entryIndex:%[0-9]*]] = add i32 [[flattenId]], %meshTaskRingIndex +; CHECK-NEXT: [[entryIndex:%[0-9]*]] = add i32 {{(%meshTaskRingIndex, )?}}[[flattenId]]{{(, %meshTaskRingIndex)?}}{{$}} ; CHECK: [[drawDataRingDescPtr:%[0-9]*]] = getelementptr {{i8|<4 x i32>}}, ptr addrspace(4) %{{[0-9]*}}, i64 {{224|14}} ; CHECK-NEXT: [[drawDataRingDesc:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) 
[[drawDataRingDescPtr]], align 16 ; CHECK: [[payloadRingDescPtr:%[0-9]*]] = getelementptr {{i8|<4 x i32>}}, ptr addrspace(4) %{{[0-9]*}}, i64 {{208|13}} diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc index 7ed782e130..17289c70a5 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc @@ -5,13 +5,13 @@ define void @matmul_f16(ptr %ptr) { ; CHECK-LABEL: define void @matmul_f16 ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ACCUM_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[ACCUM_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0, i32 0) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x float> [ [[ACCUM_LOAD]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; CHECK-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; CHECK: end: @@ -19,7 +19,7 @@ define void @matmul_f16(ptr %ptr) { ; CHECK-NEXT: ret void ; entry: - %accum.load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %accum.load = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0, i32 0) br label %loop loop: @@ -29,7 +29,7 @@ loop: %b = call <8 x float> @getmat1() %accum.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum.phi, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %accum.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() @@ -49,7 +49,7 @@ define void @matmul_f16_initzero(ptr %ptr) { ; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; CHECK-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; CHECK: end: @@ -66,7 +66,7 @@ loop: %b = call <8 x float> @getmat1() %accum.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum.phi, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %accum.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc index 9bff238d2c..29fa46b2bb 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc @@ -6,8 +6,8 @@ define void @matmul_f16_pack_simple(ptr %out0, ptr %out1, <8 x float> %a, <8 x f ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) @@ -15,8 +15,8 @@ define void @matmul_f16_pack_simple(ptr %out0, ptr %out1, <8 x float> %a, <8 x f ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) ret void @@ -27,10 +27,10 @@ define void @matmul_f16_pack_chain_sequential(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 false) ; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 true) @@ -38,10 +38,10 @@ define void @matmul_f16_pack_chain_sequential(ptr %out0, ptr %out1, <8 x float> ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void @@ -52,10 +52,10 @@ define void @matmul_f16_pack_chain_alternating(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 true) @@ -63,10 +63,10 @@ define void @matmul_f16_pack_chain_alternating(ptr %out0, ptr %out1, <8 x float> ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void @@ -77,10 +77,10 @@ define void @matmul_f16_pack_chain_nested(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_2]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_2]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 true) @@ -88,10 +88,10 @@ define void @matmul_f16_pack_chain_nested(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void @@ -101,14 +101,14 @@ define void @matmul_f16_no_packable_chain(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-LABEL: define void @matmul_f16_no_packable_chain ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_1]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.1) ret void } @@ -118,16 +118,16 @@ define void @matmul_f16_chain_loop(ptr %out0, ptr %out1, <8 x float> %a, <8 x fl ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) ; GFX11-NEXT: br label [[LOOP:%.*]] ; GFX11: loop: ; GFX11-NEXT: [[ACCUM1_PHI:%.*]] = phi <8 x float> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[CHAIN1_2:%.*]], [[LOOP]] ] -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: end: @@ -138,16 +138,16 @@ define void @matmul_f16_chain_loop(ptr %out0, ptr %out1, <8 x float> %a, <8 x fl ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %loop loop: %accum0.phi = phi <8 x float> [ %chain0.1, %entry ], [ %chain0.2, %loop ] %accum1.phi = phi <8 x float> [ %chain1.1, %entry ], [ %chain1.2, %loop ] - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end @@ -171,8 +171,8 @@ define void @matmul_f16_chain_loop_phis(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: loop: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 false, i1 false, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 false, i1 false, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 false, i1 false, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 false, i1 false, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[HEADER]] ; GFX11: end: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[ACCUM1_PHI]], i1 false) @@ -195,8 +195,8 @@ header: loop: %accum0.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum0.phi, i32 1, i32 1, i32 0, i32 1) %accum1.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum1.phi, i32 1, i32 1, i32 0, i32 1) - %muladdLo = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1, i32 1) %accum0.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo, i32 1, i32 1, i32 1, i32 0) %accum1.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi, i32 1, i32 1, i32 1, i32 0) @@ -213,23 +213,23 @@ define void @matmul_f16_chain_branch(ptr %out0, ptr %out1, <8 x float> %a, <8 x ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] ; GFX11: if_true: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: if_false: ; GFX11-NEXT: [[A_FALSE:%.*]] = call <8 x float> @getmat1() ; GFX11-NEXT: [[B_FALSE:%.*]] = call <8 x float> @getmat1() ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) -; GFX11-NEXT: [[CHAIN0_3:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN1_3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP4]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP4]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[END]] ; GFX11: end: ; GFX11-NEXT: [[ACCUM0_PHI:%.*]] = phi <8 x float> [ [[CHAIN0_2]], [[IF_TRUE]] ], [ [[CHAIN0_3]], [[IF_FALSE]] ] @@ -239,22 +239,22 @@ define void @matmul_f16_chain_branch(ptr %out0, ptr %out1, <8 x float> %a, <8 x ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %cc = call i1 @getcc() br i1 %cc, label %if_true, label %if_false if_true: - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %end if_false: %a.false = call <8 x float> @getmat1() %b.false = call <8 x float> @getmat1() - %chain0.3 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.3 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.3 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.3 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %end @@ -272,15 +272,15 @@ define void @matmul_f16_chain_diff_bbs(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br label [[CONT:%.*]] ; GFX11: cont: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: end: ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN0_2]]) @@ -288,13 +288,13 @@ define void @matmul_f16_chain_diff_bbs(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %cc = call i1 @getcc() br label %cont cont: - %chain0.2 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %end end: @@ -316,8 +316,8 @@ define void @matmul_f16_pack_loop(ptr %out0, ptr %out1) { ; GFX11-NEXT: [[ACCUM1_PHI:%.*]] = phi <8 x float> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[MULADDHI:%.*]], [[LOOP]] ] ; GFX11-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; GFX11-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: end: @@ -341,8 +341,8 @@ loop: %accum0.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum0.phi, i32 1, i32 1, i32 0, i32 1) %accum1.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum1.phi, i32 1, i32 1, i32 0, i32 1) - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %accum0.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo, i32 1, i32 1, i32 1, i32 0) %accum1.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi, i32 1, i32 1, i32 1, i32 0) @@ -360,8 +360,8 @@ define void @matmul_f16_pack_scalar_same(ptr %out0, ptr %out1, <8 x float> %a, < ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[MULADDHI]], <2 x half> , i32 6, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP1]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) @@ -370,8 +370,8 @@ define void @matmul_f16_pack_scalar_same(ptr %out0, ptr %out1, <8 x float> %a, < ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH310F, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) @@ -384,8 +384,8 @@ define void @matmul_f16_pack_scalar_different(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[MULADDHI]], <2 x half> , i32 6, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP1]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) @@ -394,8 +394,8 @@ define void @matmul_f16_pack_scalar_different(ptr %out0, ptr %out1, <8 x float> ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH3100, i32 1, i32 1) call void (...) 
@lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) @@ -408,8 +408,8 @@ define void @matmul_f16_pack_scalar_only_lo(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) ; GFX11-NEXT: [[SCALEDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP1]], half 0xH310F, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDLO]]) @@ -418,8 +418,8 @@ define void @matmul_f16_pack_scalar_only_lo(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) @@ -431,8 +431,8 @@ define void @matmul_f16_pack_scalar_only_hi(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) ; GFX11-NEXT: [[SCALEDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP1]], half 0xH3100, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) @@ -441,8 +441,8 @@ define void @matmul_f16_pack_scalar_only_hi(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH3100, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledHi) @@ -454,8 +454,8 @@ define void @matmul_f16_pack_scalar_diff_bbs(ptr %out0, ptr %out1, <8 x float> % ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE_LO:%.*]] ; GFX11: scale_lo: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) @@ -471,8 +471,8 @@ define void @matmul_f16_pack_scalar_diff_bbs(ptr %out0, ptr %out1, <8 x float> % ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %scale_lo scale_lo: @@ -494,8 +494,8 @@ define void @matmul_f16_pack_user_between_scalar(ptr %out0, ptr %out1, <8 x floa ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE:%.*]] ; GFX11: scale: ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) @@ -510,8 +510,8 @@ define void @matmul_f16_pack_user_between_scalar(ptr %out0, ptr %out1, <8 x floa ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %scale scale: @@ -531,8 +531,8 @@ define void @matmul_f16_pack_factor_between_scalar(ptr %in, ptr %out0, ptr %out1 ; GFX11-SAME: (ptr [[IN:%.*]], ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE:%.*]] ; GFX11: scale: ; GFX11-NEXT: [[FACTORHI:%.*]] = load half, ptr [[IN]], align 2 @@ -547,8 +547,8 @@ define void @matmul_f16_pack_factor_between_scalar(ptr %in, ptr %out0, ptr %out1 ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) br label %scale scale: @@ -568,11 +568,11 @@ define void @matmul_f16_pack_binop_fadd(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) ; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) @@ -584,10 +584,10 @@ define void @matmul_f16_pack_binop_fadd(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 %binOpHi = call <8 x float> (...) 
@lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdHi0, <8 x float> %muladdHi1, i32 1, i32 1) #3 call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) @@ -600,11 +600,11 @@ define void @matmul_f16_pack_binop_incompatible_matrices(ptr %out0, ptr %out1, < ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) ; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) @@ -616,10 +616,10 @@ define void @matmul_f16_pack_binop_incompatible_matrices(ptr %out0, ptr %out1, < ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo0 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 %binOpHi = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdHi0, <8 x float> %muladdHi0, i32 1, i32 1) #3 call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) @@ -632,11 +632,11 @@ define void @matmul_f16_pack_binop_incompatible_arithop(ptr %out0, ptr %out1, <8 ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) ; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) @@ -648,10 +648,10 @@ define void @matmul_f16_pack_binop_incompatible_arithop(ptr %out0, ptr %out1, <8 ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 %binOpHi = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 3, <8 x float> %muladdHi0, <8 x float> %muladdHi1, i32 1, i32 1) #3 call void (...) 
@lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) @@ -664,15 +664,15 @@ define void @matmul_f16_unpack_before_convert(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) ; GFX11-NEXT: [[CONVERTLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[TMP1]], i32 1, i32 1, i32 1, i32 0) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 true) ; GFX11-NEXT: [[CONVERTHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[TMP2]], i32 1, i32 1, i32 1, i32 0) ; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[CONVERTLO]], <8 x float> [[B]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[CONVERTHI]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[CONVERTLO]], <8 x float> [[B]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[CONVERTHI]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP4]]) ; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 true) @@ -680,12 +680,12 @@ define void @matmul_f16_unpack_before_convert(ptr %out0, ptr %out1, <8 x float> ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo0 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %convertLo = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo0, i32 1, i32 1, i32 1, i32 0) %convertHi = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi0, i32 1, i32 1, i32 1, i32 0) - %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %convertLo, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %convertHi, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %convertLo, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %convertHi, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo1) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi1) ret void @@ -695,15 +695,15 @@ define void @matmul_f32_no_pack(ptr %out0, ptr %out1, <8 x float> %a, <8 x float ; GFX11-LABEL: define void @matmul_f32_no_pack ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C1]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2, i32 2) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C1]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2, i32 2) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> [[MULADDLO]]) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> [[MULADDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2, i32 2) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2, i32 2) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> %muladdLo) call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> %muladdHi) ret void @@ -713,17 +713,17 @@ define void @matmul_f16_modified_accumulator(ptr %out0, ptr %out1, <8 x float> % ; GFX11-LABEL: define void @matmul_f16_modified_accumulator ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[ACCUM_C2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[MULADDLO]], <8 x float> [[C1]], i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_C2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_C2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[MULADDLO]]) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[MULADDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %accum.c2 = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo, <8 x float> %c1, i32 1, i32 1) - %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) ret void @@ -734,22 +734,22 @@ define void @matmul_f16_store_between_muladds(ptr %out0, ptr %out1, <8 x float> ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN0_2]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } @@ -759,22 +759,22 @@ define void @matmul_f16_store_within_chain(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1, i32 1) ; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 false) ; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) ; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 true) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) - %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc index 727138420e..9af5b0dacd 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc @@ -10,7 +10,7 @@ define <8 x float> @insert_transpose(<8 x float> %x) { ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[X]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: @@ -28,7 +28,7 @@ loop: %f = call <8 x float> @getmat1() %pre.t = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %v.loop, i32 1, i32 0) - %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre.t, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre.t, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %v.next = call <8 x float> (...) 
@lgc.cooperative.matrix.transpose__v8f32(<8 x float> %muladd, i32 1, i32 0) %cc = call i1 @getcc() @@ -43,12 +43,12 @@ define <8 x float> @reuse_transpose(<8 x float> %x) { ; CHECK-LABEL: define <8 x float> @reuse_transpose ; CHECK-SAME: (<8 x float> [[X:%.*]]) { ; CHECK-NEXT: [[T1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> [[X]], i32 1, i32 0) -; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[T1]], <8 x float> [[X]], <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[T1]], <8 x float> [[X]], <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: ret <8 x float> [[R]] ; %t1 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %x, i32 1, i32 0) %t2 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %t1, i32 1, i32 0) - %r = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %t1, <8 x float> %t2, <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %r = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %t1, <8 x float> %t2, <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ret <8 x float> %r } @@ -63,7 +63,7 @@ define <8 x float> @insert_convert(ptr %ptr) { ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[LOAD]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: @@ -81,7 +81,7 @@ loop: %f = call <8 x float> @getmat1() %pre = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %v.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() @@ -96,12 +96,12 @@ define <8 x float> @reuse_convert(<8 x float> %x) { ; CHECK-LABEL: define <8 x float> @reuse_convert ; CHECK-SAME: (<8 x float> [[X:%.*]]) { ; CHECK-NEXT: [[CVT1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[X]], i32 1, i32 1, i32 0, i32 1) -; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[X]], <8 x float> [[X]], <8 x float> [[CVT1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[X]], <8 x float> [[X]], <8 x float> [[CVT1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: ret <8 x float> [[R]] ; %cvt1 = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %x, i32 1, i32 1, i32 0, i32 1) %cvt2 = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %cvt1, i32 1, i32 1, i32 1, i32 0) - %r = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %cvt2, <8 x float> %cvt2, <8 x float> %cvt1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %r = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %cvt2, <8 x float> %cvt2, <8 x float> %cvt1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ret <8 x float> %r } @@ -193,7 +193,7 @@ define void @convert_to_acc_inner_chain(ptr %ptr) { ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[LOAD]], [[ENTRY:%.*]] ], [ [[SCALAR:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) ; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[MULADD]], <8 x float> [[MULADD]], i32 1, i32 1) ; CHECK-NEXT: [[SCALAR]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[BINOP]], half 0xH310F, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() @@ -214,7 +214,7 @@ loop: %f = call <8 x float> @getmat1() %pre = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 1) %binop = call <8 x float> (...) 
@lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladd, <8 x float> %muladd, i32 1, i32 1) %scalar = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %binop, half 0xH310F, i32 1, i32 1) %v.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 0) diff --git a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc index cb41cd1279..bc1b2c82e5 100644 --- a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc +++ b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(i32 %target, i32 %levels, {i32} %state, ...) noreturn diff --git a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc index 8aecb2319d..bc0b8750ba 100644 --- a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(i32, i32, { i32 }, ...) 
#0 diff --git a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc index dc9f6d1f1e..bfaeb3c10d 100644 --- a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s %_rgen_1.Frame = type { ptr addrspace(7), ptr addrspace(7), i32 } diff --git a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc index 793f4bbdad..8486eac1cb 100644 --- a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(...) 
noreturn declare ptr addrspace(32) @lgc.cps.alloc(i32) diff --git a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc index a2c79432d8..d057bd62b7 100644 --- a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(...) noreturn diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc index 97b624e86e..99b85f4344 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc @@ -11,7 +11,7 @@ define <8 x i32> @muladd_bf16_bf16(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) { ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[VALUE1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7, i32 7) ret <8 x i32> %value } @@ -23,7 +23,7 @@ define <8 x float> @muladd_bf16_f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c) ; CHECK-NEXT: [[VALUE1:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <8 x float> [[C]]) ; CHECK-NEXT: ret <8 x float> [[VALUE1]] ; - %value = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c, i1 false, i1 false, i1 false, i1 false, i32 2, i32 7) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7, i32 2) ret <8 x float> %value } diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc index 90ffa98e22..3738960bb8 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc @@ -1,22 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5 ; RUN: lgc -march=amdgcn -o - --mcpu=gfx1010 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s define void @matmul_f16f32_emulator(ptr addrspace(3) %out0, <8 x float> %a, <8 x float> %b, <8 x float> %c0) !lgc.shaderstage !0 { -; CHECK-NOT: v_dot - %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 1) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 2) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value) ret void } define void @matmul_i16i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { -; CHECK-NOT: v_dot - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 4) + %value = call <8 x i32> (...) 
@lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 4, i32 4, i32 5) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) ret void } define void @matmul_i8i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { -; CHECK-NOT: v_dot - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 3) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 3, i32 3, i32 5) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) ret void } @@ -30,3 +28,5 @@ declare void @lgc.cooperative.matrix.store(...) ; Setting Threadgroup Dimensions to 64 x 1 x 1 !llpc.compute.mode = !{!1} !1 = !{i32 64, i32 1, i32 1} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc index 88292bf642..c1aca85e3d 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc @@ -1,22 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5 ; RUN: lgc -march=amdgcn -o - --mcpu=gfx1011 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s define void @matmul_f16f32_emulator(ptr addrspace(3) %out0, <8 x float> %a, <8 x float> %b, <8 x float> %c0) !lgc.shaderstage !0 { -; CHECK: v_dot2c_f32_f16 - %value = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 1) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 2) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value) ret void } define void @matmul_i16i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { -; CHECK: v_dot2_i32_i16 - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 4) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 4, i32 4, i32 5) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) ret void } define void @matmul_i8i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { -; CHECK: v_dot4c_i32_i8 - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 3) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 3, i32 3, i32 5) call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) ret void } @@ -31,3 +29,5 @@ declare void @lgc.cooperative.matrix.store(...) !llpc.compute.mode = !{!1} !1 = !{i32 64, i32 1, i32 1} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc b/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc index 6fc5998cb8..623c0f6208 100644 --- a/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc +++ b/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 2 -; RUN: lgc --mcpu=gfx1100 -o - -passes="require,module(lgc-lower-desc),module(lgc-patch-entry-point-mutate),function(lgc-patch-buffer-op)" %s | FileCheck --check-prefixes=GFX11 %s +; RUN: lgc --mcpu=gfx1100 -o - -passes="require,module(lgc-lower-desc),module(lgc-mutate-entry-point),function(lgc-patch-buffer-op)" %s | FileCheck --check-prefixes=GFX11 %s define amdgpu_kernel void @strided_buffer_desc_to_ptr(<4 x i32> inreg %desc, ptr %out) { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr diff --git a/lgc/test/WorkgroupIdOpt.lgc b/lgc/test/WorkgroupIdOpt.lgc index 43d0088202..a33032b127 100644 --- a/lgc/test/WorkgroupIdOpt.lgc +++ b/lgc/test/WorkgroupIdOpt.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5 -; RUN: lgc -mcpu=gfx1100 -passes=lgc-patch-entry-point-mutate -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1100 -passes=lgc-mutate-entry-point -o - %s | FileCheck --check-prefixes=CHECK %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p32:32:32" target triple = "amdgcn--amdpal" diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc index 4b3903cef3..4ffc869bff 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc +++ 
b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc @@ -10,27 +10,35 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP23]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP23]], i32 -1) ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP27:%.*]] = load <8 x i32>, ptr addrspace(4) 
[[TMP25]], align 4, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP28]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) -; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP26]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP21:%.*]] = call <8 x i32> 
@llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP16]], <8 x i32> [[TMP20]]) ; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0) ; CHECK-NEXT: ret void diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc index 21c05a91d3..1a89e812f6 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc @@ -10,19 +10,27 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP17]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP17]], i32 -1) ] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP5]], align 16, 
!invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP19]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP19]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP11]], align 16, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> -; CHECK-NEXT: [[TMP14:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP20]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP8]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP11]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP19]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP8]], <8 x i32> [[TMP12]]) ; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP7]], i32 15, i32 1, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: ret void diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc index 748eb7bb27..176dca5ce4 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc @@ -10,20 +10,28 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; 
CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP17]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP17]], i32 -1) ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) 
[[TMP21]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP13]], <4 x i32> , i1 false, i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) ; CHECK-NEXT: ret void diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc index 29fe33a9e3..1d568c02d8 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc @@ -12,23 +12,31 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; 
CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP21]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP23]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP23]], i32 -1) ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP24]], align 16, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = call ptr addrspace(4) @foo1(i32 [[TMP0]]) -; CHECK-NEXT: [[TMP19:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP26:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP8]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP10]], <4 x i32> [[TMP9]]) ; CHECK-NEXT: 
[[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP11]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP22]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP11]], <4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP15]], <4 x i32> [[TMP16]], i1 false, i32 0, i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP11]], <4 x float> [[TMP17]]) diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc index 845b764733..81b2717e12 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc @@ -10,30 +10,38 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[DOTNOT]], label [[RET:%.*]], 
label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP23]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP23]], i32 -1) ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 4, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP27:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP25]], align 4, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], 
i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP28]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) -; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP26]], align 4, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP21:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP16]], <8 x i32> [[TMP20]]) ; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0) ; CHECK-NEXT: br label [[RET]] diff --git a/lgc/test/tanh.lgc b/lgc/test/tanh.lgc index 977bc458fb..b21cdaf689 100644 --- a/lgc/test/tanh.lgc +++ b/lgc/test/tanh.lgc @@ -33,22 +33,12 @@ define float @sample(float %x) !lgc.shaderstage !1 { ret float %y } -; Function Attrs: nounwind willreturn memory(read) -declare !lgc.create.opcode !2 i32 @lgc.create.read.builtin.input.i32(...) 
#0 - -; Function Attrs: nounwind willreturn memory(none) -declare ptr addrspace(7) @lgc.load.buffer.desc(i64, i32, i32, i32) #1 - ; Function Attrs: nounwind memory(none) -declare !lgc.create.opcode !3 float @lgc.create.tanh.f32(...) #2 +declare float @lgc.create.tanh.f32(...) #2 -attributes #0 = { nounwind willreturn memory(read) } -attributes #1 = { nounwind willreturn memory(none) } -attributes #2 = { nounwind memory(none) } +attributes #0 = { nounwind memory(none) } !llpc.compute.mode = !{!0} !0 = !{i32 8, i32 8, i32 1} !1 = !{i32 7} -!2 = !{i32 77} -!3 = !{i32 17} diff --git a/lgc/util/GfxRegHandler.cpp b/lgc/util/GfxRegHandler.cpp index 3d764c2951..c91b353861 100644 --- a/lgc/util/GfxRegHandler.cpp +++ b/lgc/util/GfxRegHandler.cpp @@ -181,6 +181,9 @@ static constexpr BitsInfo SqImgRsrcRegBitsGfx10[static_cast(SqRsrcRegs {1, 30, 2}, // WidthLo {2, 0, 12}, // WidthHi {5, 0, 4}, // ArrayPitch + {1, 8, 12}, // MinLod + {}, // MinLodLo + {}, // MinLodHi }; // ===================================================================================================================== @@ -204,6 +207,9 @@ static constexpr BitsInfo SqImgRsrcRegBitsGfx11[static_cast(SqRsrcRegs {1, 30, 2}, // WidthLo {2, 0, 12}, // WidthHi {5, 0, 4}, // ArrayPitch + {}, // MinLod + {5, 27, 5}, // MinLodLo + {6, 0, 7}, // MinLodHi }; // ===================================================================================================================== @@ -247,6 +253,17 @@ Value *SqImgRsrcRegHandler::getReg(SqRsrcRegs regId) { case SqRsrcRegs::BaseArray: case SqRsrcRegs::ArrayPitch: return getRegCommon(static_cast(regId)); + case SqRsrcRegs::MinLod: + switch (m_gfxIpVersion->major) { + case 10: + return getRegCommon(static_cast(regId)); + case 11: + return getRegCombine(static_cast(SqRsrcRegs::MinLodLo), static_cast(SqRsrcRegs::MinLodHi)); + default: + llvm_unreachable("GFX IP is not supported!"); + break; + } + break; case SqRsrcRegs::Depth: case SqRsrcRegs::Height: case SqRsrcRegs::Pitch: @@ 
-281,10 +298,26 @@ void SqImgRsrcRegHandler::setReg(SqRsrcRegs regId, Value *regValue) { case SqRsrcRegs::DstSelXYZW: case SqRsrcRegs::SwizzleMode: case SqRsrcRegs::Type: - case SqRsrcRegs::Depth: case SqRsrcRegs::BcSwizzle: + case SqRsrcRegs::BaseLevel: + case SqRsrcRegs::LastLevel: + case SqRsrcRegs::BaseArray: setRegCommon(static_cast(regId), regValue); break; + case SqRsrcRegs::MinLod: + switch (m_gfxIpVersion->major) { + case 10: + setRegCommon(static_cast(regId), regValue); + break; + case 11: + setRegCombine(static_cast(SqRsrcRegs::MinLodLo), static_cast(SqRsrcRegs::MinLodHi), regValue); + break; + default: + llvm_unreachable("GFX IP is not supported!"); + break; + } + break; + case SqRsrcRegs::Depth: case SqRsrcRegs::Height: case SqRsrcRegs::Pitch: setRegCommon(static_cast(regId), m_builder->CreateSub(regValue, m_one)); diff --git a/lgc/util/ModuleBunch.cpp b/lgc/util/ModuleBunch.cpp index 7a2f747f4a..0ecf4be684 100644 --- a/lgc/util/ModuleBunch.cpp +++ b/lgc/util/ModuleBunch.cpp @@ -77,8 +77,12 @@ bool ModuleBunch::isNormalized() const { /// to Module::print for each module. 
void ModuleBunch::print(raw_ostream &OS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder, bool IsForDebug) const { - for (const Module &M : *this) - M.print(OS, AAW, ShouldPreserveUseListOrder, IsForDebug); + for (const std::unique_ptr &M : Modules) { + if (!M) + OS << "\n"; + else + M->print(OS, AAW, ShouldPreserveUseListOrder, IsForDebug); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llpc/CMakeLists.txt b/llpc/CMakeLists.txt index 748b948e98..825530be64 100644 --- a/llpc/CMakeLists.txt +++ b/llpc/CMakeLists.txt @@ -186,7 +186,7 @@ target_include_directories(llpcinternal include ../include context - lower + lowering translator/include translator/lib/SPIRV translator/lib/SPIRV/libSPIRV @@ -214,28 +214,29 @@ if(ICD_BUILD_LLPC) context/llpcRayTracingContext.cpp ) -# llpc/lower +# llpc/lowering target_sources(llpcinternal PRIVATE - lower/llpcSpirvLower.cpp - lower/LowerAccessChain.cpp - lower/LowerCfgMerges.cpp - lower/LowerConstImmediateStore.cpp - lower/LowerGlobals.cpp - lower/LowerInstMetaRemove.cpp - lower/LowerMath.cpp - lower/LowerMemoryOp.cpp - lower/LowerPostInline.cpp - lower/LowerRayTracing.cpp - lower/LowerTerminator.cpp - lower/LowerTranslator.cpp - lower/llpcSpirvLowerUtil.cpp - lower/ProcessGpuRtLibrary.cpp - lower/LowerInternalLibraryIntrinsic.cpp - lower/LowerGLCompatibility.cpp - lower/LowerCooperativeMatrix.cpp - lower/PrepareContinuations.cpp - lower/LowerAdvancedBlend.cpp - lower/ProcessGfxRuntimeLibrary.cpp + lowering/Lowering.cpp + lowering/LowerAccessChain.cpp + lowering/LowerCfgMerges.cpp + lowering/LowerConstImmediateStore.cpp + lowering/LowerGlobals.cpp + lowering/LowerInstMetaRemove.cpp + lowering/LowerMath.cpp + lowering/LowerMemoryOp.cpp + lowering/LowerPostInline.cpp + lowering/LowerRayTracing.cpp + lowering/LowerTerminator.cpp + lowering/LowerTranslator.cpp + lowering/LoweringUtil.cpp + lowering/ProcessGpuRtLibrary.cpp + lowering/LowerInternalLibraryIntrinsic.cpp + lowering/LowerGlCompatibility.cpp 
+ lowering/ScalarReplacementOfBuiltins.cpp + lowering/LowerCooperativeMatrix.cpp + lowering/PrepareContinuations.cpp + lowering/LowerAdvancedBlend.cpp + lowering/ProcessGfxRuntimeLibrary.cpp ) # llpc/translator @@ -385,7 +386,7 @@ target_include_directories(llpc_standalone_compiler PUBLIC ${PROJECT_SOURCE_DIR}/../util ${PROJECT_SOURCE_DIR}/context ${PROJECT_SOURCE_DIR}/include - ${PROJECT_SOURCE_DIR}/lower + ${PROJECT_SOURCE_DIR}/lowering ${PROJECT_SOURCE_DIR}/tool ${PROJECT_SOURCE_DIR}/translator/include ${PROJECT_SOURCE_DIR}/translator/lib/SPIRV diff --git a/llpc/context/llpcCompiler.cpp b/llpc/context/llpcCompiler.cpp index 433757dac3..fb897e9acd 100644 --- a/llpc/context/llpcCompiler.cpp +++ b/llpc/context/llpcCompiler.cpp @@ -34,6 +34,8 @@ #include "LowerCfgMerges.h" #include "LowerRayTracing.h" #include "LowerTranslator.h" +#include "Lowering.h" +#include "LoweringUtil.h" #include "PrepareContinuations.h" #include "SPIRVEntry.h" #include "SPIRVFunction.h" @@ -49,8 +51,6 @@ #include "llpcGraphicsContext.h" #include "llpcRayTracingContext.h" #include "llpcShaderModuleHelper.h" -#include "llpcSpirvLower.h" -#include "llpcSpirvLowerUtil.h" #include "llpcThreading.h" #include "llpcTimerProfiler.h" #include "llpcUtil.h" @@ -78,6 +78,7 @@ #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/Support/ErrorHandling.h" + #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 442438 // Old version of the code #else @@ -1855,7 +1856,7 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRefaddPass(SpirvLowerTranslator(entryStage, shaderInfoEntry)); + lowerPassMgr->addPass(LowerTranslator(entryStage, shaderInfoEntry)); if (EnableOuts()) { lowerPassMgr->addPass( PrintModulePass(outs(), "\n" @@ -2019,7 +2020,7 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRef lock(getHelperThreadMutex()); - rtContext->getRayTracingLibrarySummary().maxUsedPayloadRegisterCount = - 
std::max(rtContext->getRayTracingLibrarySummary().maxUsedPayloadRegisterCount, maxUsedPayloadRegisterCount); + rtContext->getRayTracingLibrarySummary().llvmRaytracingState.merge(*moduleStateOrErr); } } @@ -3107,7 +3110,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, SpirvLower::registerTranslationPasses(*lowerPassMgr); // SPIR-V translation, then dump the result. - lowerPassMgr->addPass(SpirvLowerTranslator(shaderInfoEntry->entryStage, shaderInfoEntry)); + lowerPassMgr->addPass(LowerTranslator(shaderInfoEntry->entryStage, shaderInfoEntry)); lowerPassMgr->addPass(LowerCfgMerges()); lowerPassMgr->addPass(AlwaysInlinerPass()); @@ -3312,8 +3315,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, // Build traversal at last after we gather all needed information. if (traversalModule) { if (isContinuationsMode) - ContHelper::setPreservedPayloadRegisterCount(*traversalModule, - rtContext.getRayTracingLibrarySummary().maxUsedPayloadRegisterCount); + rtContext.getRayTracingLibrarySummary().llvmRaytracingState.exportModuleMetadata(*traversalModule); auto rayFlagsKnownBits = rtContext.getRayFlagsKnownBits(); lgc::gpurt::setKnownSetRayFlags(*traversalModule, rayFlagsKnownBits.One.getZExtValue()); diff --git a/llpc/context/llpcComputeContext.h b/llpc/context/llpcComputeContext.h index 7dccf683bc..f554ff9f9d 100644 --- a/llpc/context/llpcComputeContext.h +++ b/llpc/context/llpcComputeContext.h @@ -40,7 +40,7 @@ class ComputeContext : public PipelineContext { public: ComputeContext(GfxIpVersion gfxIp, const ComputePipelineBuildInfo *pipelineInfo, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash); - virtual ~ComputeContext() {} + virtual ~ComputeContext() = default; virtual PipelineType getPipelineType() const override { return PipelineType::Compute; } diff --git a/llpc/context/llpcContext.cpp b/llpc/context/llpcContext.cpp index c1476c6411..661fb3f60d 100644 --- a/llpc/context/llpcContext.cpp +++ 
b/llpc/context/llpcContext.cpp @@ -34,13 +34,13 @@ #include "LowerCfgMerges.h" #include "LowerGlobals.h" #include "LowerTranslator.h" +#include "Lowering.h" #include "ProcessGfxRuntimeLibrary.h" #include "ProcessGpuRtLibrary.h" #include "SPIRVInternal.h" #include "llpcCompiler.h" #include "llpcDebug.h" #include "llpcPipelineContext.h" -#include "llpcSpirvLower.h" #include "llpcTimerProfiler.h" #include "vkgcMetroHash.h" #include "gfxruntime/GfxRuntimeLibrary.h" @@ -100,10 +100,6 @@ Context::Context(GfxIpVersion gfxIp) : LLVMContext(), m_gfxIp(gfxIp) { reset(); } -// ===================================================================================================================== -Context::~Context() { -} - // ===================================================================================================================== void Context::reset() { m_pipelineContext = nullptr; @@ -127,7 +123,6 @@ LgcContext *Context::getLgcContext() { lgc::GpurtContext::get(*this).theModule = nullptr; lgc::GpurtContext::get(*this).ownedTheModule.reset(); lgc::GfxRuntimeContext::get(*this).theModule.reset(); - // Pass the state of LLPC_OUTS on to LGC. LgcContext::setLlpcOuts(EnableOuts() ? 
&outs() : nullptr); } @@ -273,7 +268,7 @@ void Context::ensureGpurtLibrary() { timerProfiler.addTimerStartStopPass(*lowerPassMgr, TimerTranslate, true); - lowerPassMgr->addPass(SpirvLowerTranslator(ShaderStageCompute, &shaderInfo, "_gpurtvar_")); + lowerPassMgr->addPass(LowerTranslator(ShaderStageCompute, &shaderInfo, "_gpurtvar_")); if (EnableOuts()) { lowerPassMgr->addPass( PrintModulePass(outs(), "\n" @@ -335,7 +330,7 @@ void Context::ensureGfxRuntimeLibrary() { timerProfiler.addTimerStartStopPass(*lowerPassMgr, TimerTranslate, true); - lowerPassMgr->addPass(SpirvLowerTranslator(ShaderStageCompute, &shaderInfo)); + lowerPassMgr->addPass(LowerTranslator(ShaderStageCompute, &shaderInfo)); if (EnableOuts()) { lowerPassMgr->addPass( PrintModulePass(outs(), "\n" diff --git a/llpc/context/llpcContext.h b/llpc/context/llpcContext.h index ebc730a825..87ff860b3f 100644 --- a/llpc/context/llpcContext.h +++ b/llpc/context/llpcContext.h @@ -51,7 +51,6 @@ namespace Llpc { class Context : public llvm::LLVMContext { public: Context(GfxIpVersion gfxIp); - ~Context(); void reset(); diff --git a/llpc/context/llpcGraphicsContext.cpp b/llpc/context/llpcGraphicsContext.cpp index 762981b296..18492fd680 100644 --- a/llpc/context/llpcGraphicsContext.cpp +++ b/llpc/context/llpcGraphicsContext.cpp @@ -88,10 +88,6 @@ GraphicsContext::GraphicsContext(GfxIpVersion gfxIp, const GraphicsPipelineBuild m_pipelineApiHash = pipelineInfo->pipelineApiHash; } -// ===================================================================================================================== -GraphicsContext::~GraphicsContext() { -} - // ===================================================================================================================== // Gets pipeline shader info of the specified shader stage // diff --git a/llpc/context/llpcGraphicsContext.h b/llpc/context/llpcGraphicsContext.h index 247cf318f2..ac5515ca78 100644 --- a/llpc/context/llpcGraphicsContext.h +++ 
b/llpc/context/llpcGraphicsContext.h @@ -42,7 +42,7 @@ class GraphicsContext : public PipelineContext { public: GraphicsContext(GfxIpVersion gfxIp, const GraphicsPipelineBuildInfo *pipelineInfo, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash); - virtual ~GraphicsContext(); + virtual ~GraphicsContext() = default; virtual PipelineType getPipelineType() const override { return PipelineType::Graphics; } diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index 21ba35235f..6799fd4ae1 100644 --- a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -144,10 +144,6 @@ PipelineContext::PipelineContext(GfxIpVersion gfxIp, MetroHash::Hash *pipelineHa : m_gfxIp(gfxIp), m_pipelineHash(*pipelineHash), m_cacheHash(*cacheHash) { } -// ===================================================================================================================== -PipelineContext::~PipelineContext() { -} - // ===================================================================================================================== // Gets the hash code of input shader with specified shader stage. 
// @@ -202,7 +198,7 @@ void PipelineContext::setRayTracingState(const Vkgc::RtState &rtState, const Vkg m_rtState.rtIpVersion = Vkgc::gpurt::getRtIpVersion(m_gfxIp); if (m_rtState.rtIpVersion.major != 0 && !m_rtState.gpurtOverride) { - gpurt::getShaderLibrarySpirv(m_rtState.gpurtFeatureFlags, m_rtState.gpurtShaderLibrary.pCode, + gpurt::getShaderLibrarySpirv(m_rtState.rtIpVersion, m_rtState.gpurtFeatureFlags, m_rtState.gpurtShaderLibrary.pCode, m_rtState.gpurtShaderLibrary.codeSize); gpurt::getFuncTable(m_rtState.rtIpVersion, m_rtState.gpurtFuncTable); } @@ -401,7 +397,7 @@ void PipelineContext::convertResourceNode(ResourceNode &dst, const ResourceMappi dst.sizeInDwords = src.sizeInDwords; dst.offsetInDwords = src.offsetInDwords; dst.abstractType = ResourceNodeType::Unknown; - dst.visibility = visibility; + dst.visibility = ShaderStageMask::fromRaw(visibility); switch (src.type) { case ResourceMappingNodeType::DescriptorTableVaPtr: { diff --git a/llpc/context/llpcPipelineContext.h b/llpc/context/llpcPipelineContext.h index b00ed8a898..0a3d7caa4b 100644 --- a/llpc/context/llpcPipelineContext.h +++ b/llpc/context/llpcPipelineContext.h @@ -119,7 +119,7 @@ enum class PipelineType { class PipelineContext { public: PipelineContext(GfxIpVersion gfxIp, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash); - virtual ~PipelineContext(); + virtual ~PipelineContext() = default; // Returns the pipeline type virtual PipelineType getPipelineType() const = 0; diff --git a/llpc/context/llpcRayTracingContext.h b/llpc/context/llpcRayTracingContext.h index c520c81854..12b77904ca 100644 --- a/llpc/context/llpcRayTracingContext.h +++ b/llpc/context/llpcRayTracingContext.h @@ -49,7 +49,7 @@ class RayTracingContext : public PipelineContext { RayTracingContext(GfxIpVersion gfxIp, const RayTracingPipelineBuildInfo *pipelineInfo, const PipelineShaderInfo *representativeShaderInfo, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash, unsigned indirectStageMask); - virtual 
~RayTracingContext() {} + virtual ~RayTracingContext() = default; virtual PipelineType getPipelineType() const override { return PipelineType::RayTracing; } diff --git a/llpc/docs/DdnBindlessTexture.md b/llpc/docs/DdnBindlessTexture.md index 30f506efde..f55908467a 100644 --- a/llpc/docs/DdnBindlessTexture.md +++ b/llpc/docs/DdnBindlessTexture.md @@ -371,7 +371,7 @@ void main() ... ``` -After the above change, we can see the pipeline dumps for the above shader, the pass “LLPC translate SPIR-V binary to LLVM IR” and the ISA code dump looks as following, the cases that declare bindless textures by as uniform uvec2 type can run correctly. +After the above change, we can see the pipeline dumps for the above shader, the pass "LLPC translate SPIR-V binary to LLVM IR" and the ISA code dump looks as following, the cases that declare bindless textures by as uniform uvec2 type can run correctly. ![](./DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG) diff --git a/llpc/docs/DdnInterShaderDataCacheTracking.md b/llpc/docs/DdnInterShaderDataCacheTracking.md index f46d1d6278..6d5aaac07c 100644 --- a/llpc/docs/DdnInterShaderDataCacheTracking.md +++ b/llpc/docs/DdnInterShaderDataCacheTracking.md @@ -22,7 +22,7 @@ Historical Background At the time of writing, LLPC already uses variant 2 described in the introduction. It does so by combining various pieces of information that have been saved off in a look-aside data structure (`ResourceUsage`) in the -`PatchCheckShaderCache` pass. +`CheckShaderCache` pass. This approach has two downsides: @@ -30,7 +30,7 @@ This approach has two downsides: the information may otherwise be obsolete. 2. Knowledge about what kind of full pipeline optimizations are applied, including some of their details, is - centralized in `PatchCheckShaderCache`. This limits the design's extensibility. + centralized in `CheckShaderCache`. This limits the design's extensibility. 
Implementation overview ----------------------- @@ -39,7 +39,7 @@ Every function in LLPC gets an attached `!llpc.hash` metadata node. This metadata node is initialized with the relevant input shader hash. Every pass that performs inter-shader transforms updates the metadata by hashing the old hash together with any data that is relevant from other shaders, i.e. it computes `h_new = h(h_old | inter-shader data)`. -The metadata node is finally inspected in the `PatchCheckShaderCache` pass. +The metadata node is finally inspected in the `CheckShaderCache` pass. Extensible specialized metadata ------------------------------- diff --git a/llpc/include/llpc.h b/llpc/include/llpc.h index f74d043222..86eb8986de 100644 --- a/llpc/include/llpc.h +++ b/llpc/include/llpc.h @@ -220,7 +220,7 @@ class IShaderCache { IShaderCache() {} /// @internal Destructor. Prevent use of delete operator on this interface. - virtual ~IShaderCache() {} + virtual ~IShaderCache() = default; }; #endif @@ -363,7 +363,7 @@ class ICompiler { protected: ICompiler() {} /// Destructor - virtual ~ICompiler() {} + virtual ~ICompiler() = default; }; } // namespace Llpc diff --git a/llpc/lower/LowerAccessChain.cpp b/llpc/lowering/LowerAccessChain.cpp similarity index 100% rename from llpc/lower/LowerAccessChain.cpp rename to llpc/lowering/LowerAccessChain.cpp diff --git a/llpc/lower/LowerAccessChain.h b/llpc/lowering/LowerAccessChain.h similarity index 99% rename from llpc/lower/LowerAccessChain.h rename to llpc/lowering/LowerAccessChain.h index 4d551b91b6..58fd43d875 100644 --- a/llpc/lower/LowerAccessChain.h +++ b/llpc/lowering/LowerAccessChain.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lower/LowerAdvancedBlend.cpp b/llpc/lowering/LowerAdvancedBlend.cpp similarity index 100% rename from llpc/lower/LowerAdvancedBlend.cpp rename to 
llpc/lowering/LowerAdvancedBlend.cpp diff --git a/llpc/lower/LowerAdvancedBlend.h b/llpc/lowering/LowerAdvancedBlend.h similarity index 98% rename from llpc/lower/LowerAdvancedBlend.h rename to llpc/lowering/LowerAdvancedBlend.h index 61412ecada..3a1af04f3d 100644 --- a/llpc/lower/LowerAdvancedBlend.h +++ b/llpc/lowering/LowerAdvancedBlend.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/ADT/DenseMap.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lower/LowerCfgMerges.cpp b/llpc/lowering/LowerCfgMerges.cpp similarity index 99% rename from llpc/lower/LowerCfgMerges.cpp rename to llpc/lowering/LowerCfgMerges.cpp index faba98c13b..e293b81caa 100644 --- a/llpc/lower/LowerCfgMerges.cpp +++ b/llpc/lowering/LowerCfgMerges.cpp @@ -36,11 +36,11 @@ *********************************************************************************************************************** */ #include "LowerCfgMerges.h" +#include "Lowering.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcDebug.h" -#include "llpcSpirvLower.h" -#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" #include "lgc/LgcDialect.h" #include "llvm/ADT/DepthFirstIterator.h" diff --git a/llpc/lower/LowerCfgMerges.h b/llpc/lowering/LowerCfgMerges.h similarity index 98% rename from llpc/lower/LowerCfgMerges.h rename to llpc/lowering/LowerCfgMerges.h index 0cdba4218e..2e22ac5328 100644 --- a/llpc/lower/LowerCfgMerges.h +++ b/llpc/lowering/LowerCfgMerges.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/ADT/DenseSet.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lower/LowerConstImmediateStore.cpp b/llpc/lowering/LowerConstImmediateStore.cpp similarity index 95% rename from llpc/lower/LowerConstImmediateStore.cpp rename to llpc/lowering/LowerConstImmediateStore.cpp index 4c0ae2502e..2311e304fd 100644 --- a/llpc/lower/LowerConstImmediateStore.cpp +++ 
b/llpc/lowering/LowerConstImmediateStore.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file LowerConstImmediateStore.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerConstImmediateStore. + * @brief LLPC source file: contains implementation of class Llpc::LowerConstImmediateStore. *********************************************************************************************************************** */ #include "LowerConstImmediateStore.h" @@ -38,7 +38,7 @@ #include "llvm/Support/Debug.h" #include -#define DEBUG_TYPE "llpc-spirv-lower-const-immediate-store" +#define DEBUG_TYPE "lower-const-immediate-store" using namespace llvm; using namespace SPIRV; @@ -51,8 +51,8 @@ namespace Llpc { // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerConstImmediateStore::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Const-Immediate-Store\n"); +PreservedAnalyses LowerConstImmediateStore::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Const-Immediate-Store\n"); SpirvLower::init(&module); @@ -75,7 +75,7 @@ PreservedAnalyses SpirvLowerConstImmediateStore::run(Module &module, ModuleAnaly // can be optimized to a read-only global variable. // // @param func : Function to process -bool SpirvLowerConstImmediateStore::processAllocaInsts(Function *func) { +bool LowerConstImmediateStore::processAllocaInsts(Function *func) { // NOTE: We only visit the entry block on the basis that SPIR-V translator puts all "alloca" // instructions there. 
bool changed = false; @@ -98,7 +98,7 @@ bool SpirvLowerConstImmediateStore::processAllocaInsts(Function *func) { // // @param allocaInst : The "alloca" instruction to process // @return true if the alloca was replaced -bool SpirvLowerConstImmediateStore::tryProcessAlloca(AllocaInst *allocaInst) { +bool LowerConstImmediateStore::tryProcessAlloca(AllocaInst *allocaInst) { // LLVM IR allocas can have an "arrayness" where multiple elements of the allocated type are allocated at once. // SPIR-V doesn't have this (because it only has OpVariable and not a "true" alloca), but let's guard against it // anyway just in case. diff --git a/llpc/lower/LowerConstImmediateStore.h b/llpc/lowering/LowerConstImmediateStore.h similarity index 93% rename from llpc/lower/LowerConstImmediateStore.h rename to llpc/lowering/LowerConstImmediateStore.h index 2df8c0b1c4..ba7828e83f 100644 --- a/llpc/lower/LowerConstImmediateStore.h +++ b/llpc/lowering/LowerConstImmediateStore.h @@ -25,12 +25,12 @@ /** *********************************************************************************************************************** * @file LowerConstImmediateStore.h - * @brief LLPC header file: contains declaration of class Llpc::SpirvLowerConstImmediateStore. + * @brief LLPC header file: contains declaration of class Llpc::LowerConstImmediateStore. 
*********************************************************************************************************************** */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/PassManager.h" namespace llvm { @@ -43,7 +43,7 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering operations for constant immediate store -class SpirvLowerConstImmediateStore : public SpirvLower, public llvm::PassInfoMixin { +class LowerConstImmediateStore : public SpirvLower, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/llpc/lower/LowerCooperativeMatrix.cpp b/llpc/lowering/LowerCooperativeMatrix.cpp similarity index 98% rename from llpc/lower/LowerCooperativeMatrix.cpp rename to llpc/lowering/LowerCooperativeMatrix.cpp index 432f8279c4..5f501eb987 100644 --- a/llpc/lower/LowerCooperativeMatrix.cpp +++ b/llpc/lowering/LowerCooperativeMatrix.cpp @@ -39,7 +39,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#define DEBUG_TYPE "llpc-spirv-lower-cooperative-matrix" +#define DEBUG_TYPE "lower-cooperative-matrix" using namespace llvm; using namespace lgc; @@ -166,7 +166,7 @@ void LowerCooperativeMatrix::visitPointerUsers(Value *ptr, CooperativeMatrixElem // // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerCooperativeMatrix::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerCooperativeMatrixProxy::run(Module &module, ModuleAnalysisManager &analysisManager) { LowerCooperativeMatrix impl{module}; return impl.run(); } diff --git a/llpc/lower/LowerCooperativeMatrix.h b/llpc/lowering/LowerCooperativeMatrix.h similarity index 92% rename from llpc/lower/LowerCooperativeMatrix.h 
rename to llpc/lowering/LowerCooperativeMatrix.h index ea15854624..52d60e1b47 100644 --- a/llpc/lower/LowerCooperativeMatrix.h +++ b/llpc/lowering/LowerCooperativeMatrix.h @@ -36,11 +36,11 @@ namespace Llpc { // ===================================================================================================================== // Pass that lower SPIR-V-specific cooperative matrix operations -class SpirvLowerCooperativeMatrix : public llvm::PassInfoMixin { +class LowerCooperativeMatrixProxy : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); - static llvm::StringRef name() { return "spirv-lower-cooperative-matrix"; } + static llvm::StringRef name() { return "lower-cooperative-matrix"; } }; } // namespace Llpc diff --git a/llpc/lowering/LowerExecutionGraph.cpp b/llpc/lowering/LowerExecutionGraph.cpp new file mode 100644 index 0000000000..a6d3de614c --- /dev/null +++ b/llpc/lowering/LowerExecutionGraph.cpp @@ -0,0 +1,986 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerExecutionGraph.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerExecutionGraph. + *********************************************************************************************************************** + */ +#include "LowerExecutionGraph.h" +#include "SPIRVInternal.h" +#include "llpcContext.h" +#include "llpcExecutionGraphContext.h" +#include "compilerutils/CompilerUtils.h" +#include "compilerutils/TypeLowering.h" +#include "lgc/Builder.h" +#include "lgc/BuiltIns.h" +#include "lgc/LgcDialect.h" +#include "lgc/LgcWgDialect.h" +#include "lgc/Pipeline.h" +#include "lgc/RuntimeContext.h" +#include "llvm/IR/DerivedUser.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/Operator.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "lower-execution-graph" + +using namespace CompilerUtils; +using namespace lgc; +using namespace lgc::wg; +using namespace llvm; +using namespace Llpc; +using namespace spv; +using namespace llvm_dialects; + +namespace SPIRV { +extern const char *MetaNameSpirvOp; +} // namespace SPIRV + +using namespace SPIRV; +namespace WorkGraphFunc { +enum : unsigned { + ShaderPreamble = 0, // Preamble function + ShaderPostamble, // Postamble function 
+ OutputAllocate, // Allocates output records for a successor node. + OutputCommit, // Commits previously-allocated output records for a successor node. + OutputGetPayload, // Retrieves the GPU VA of a specific output payload entry + InputGetPayloadCount, // Retrieves the input payload count. + InputGetPayloadAtIndex, // Retrieves the GPU address for an input payload at the specified index + WorkgroupId, // Workgroup ID + GlobalThreadId, // Global Thread ID, + ShaderEmptyInputPreamble, // Empty input preamble + IncrementEmptyOutputCount, // Empty output count + InitCrossGroupSharing, // Init cross group sharing + FinishCrossGroupSharing, // Finish cross group sharing + IsOutputNodePresent, // Checks if an output node is valid + GetRemainingRecursionDepth, // Get remaining recursion depth + IsThreadLaunchInvocationValid, // Is ThreadLaunch Invocation Valid + Count +}; +} // namespace WorkGraphFunc + +const char *WorkGraphNames[] = { + "AmdWorkGraphsShaderPreamble", // ShaderPreamble + "AmdWorkGraphsShaderPostamble", // ShaderPostamble + "AmdWorkGraphsOutputAllocate", // OutputAllocate + "AmdWorkGraphsOutputCommit", // OutputCommit + "AmdWorkGraphsOutputGetPayload", // OutputGetPayload + "AmdWorkGraphsInputGetPayloadCount", // InputGetPayloadCount + "AmdWorkGraphsInputGetPayloadAtIndex", // InputGetPayloadAtIndex + "AmdWorkGraphsGroupId", // WorkgroupId + "AmdWorkGraphsGlobalThreadId", // GlobalThreadId + "AmdWorkGraphsShaderEmptyInputPreamble", // Empty input preamble + "AmdWorkGraphsIncrementEmptyOutputCount", // Empty output count + "AmdWorkGraphsInitCrossGroupSharing", // Init cross group sharing + "AmdWorkGraphsFinishCrossGroupSharing", // Finish cross group sharing + "AmdWorkGraphsIsOutputNodePresent", // Checks if an output node is valid + "AmdWorkGraphsGetRemainingRecursionDepth", // Current graphs recursion depth + "AmdWorkGraphsIsThreadLaunchInvocationValid", // Is ThreadLaunch Invocation Valid +}; + +static const char *OutputArgNames[] = {"ShaderState", 
"Scope", "OutputIdx", "ArrayIdx", "Count"}; +static const char *EntryFuncName = "shader"; // Execution graph entry name +const char *WorkgraphOutputCount = "WorkgraphOutputCount"; +const char *WorkgraphGetLds = "WorkgraphGetLds"; + +namespace { + +struct LoweringVisitorPayload { + Llpc::LowerExecutionGraph &pass; + TypeLowering typeLower; + + explicit LoweringVisitorPayload(Type *payloadArrayPtrType, Llpc::LowerExecutionGraph &pass) + : pass(pass), typeLower(payloadArrayPtrType->getContext()) { + typeLower.addRule([payloadArrayPtrType](TypeLowering &, Type * type) -> auto { + SmallVector lowered; + auto &context = type->getContext(); + if (type->isPointerTy() && type->getPointerAddressSpace() == SPIRAS_PayloadArray) { + lowered.push_back(PointerType::get(context, SPIRAS_Private)); + lowered.push_back(payloadArrayPtrType); + } else if (isPayloadType(type)) { + lowered.push_back(payloadArrayPtrType); + } + return lowered; + }); + typeLower.addConstantRule([](TypeLowering &, Constant * c, ArrayRef loweredTypes) -> auto { + SmallVector lowered; + if (auto *gv = dyn_cast(c)) { + if (gv->getAddressSpace() == SPIRAS_PayloadArray) { + // Stand-in for an input payload array. We don't actually need the value for anything. 
+ lowered.push_back(PoisonValue::get(loweredTypes[0])); + } + } + return lowered; + }); + } +}; + +} // anonymous namespace + +template <> struct llvm_dialects::VisitorPayloadProjection { + static Llpc::LowerExecutionGraph &project(LoweringVisitorPayload &payload) { return payload.pass; } +}; + +LLVM_DIALECTS_VISITOR_PAYLOAD_PROJECT_FIELD(LoweringVisitorPayload, typeLower) + +namespace Llpc { + +static constexpr unsigned MaxGridCount = 65535; // Max dispatch grid count + +// ===================================================================================================================== +LowerExecutionGraph::LowerExecutionGraph(Pipeline *pipeline) + : m_pipeline(pipeline), m_graphLds(nullptr), m_threadLaunch(false) { + for (unsigned i = 0; i < WorkGraphFunc::Count; ++i) { + m_workGraphLibFuncNames[WorkGraphNames[i]] = i; + } +} + +// ===================================================================================================================== +// Executes this SPIR-V lowering pass on the specified LLVM module. 
// @param [in/out] module : LLVM module to be run on
m_builder->SetInsertPointPastAllocas(m_entryPoint); + initAllocVariables(m_builder); + // Call ShaderPreamble + // NOTE: according to the PAL comment notes to the EmptyInputPreamble, for dynamic dispatch workgroup, implied by + // the MaxNumWorkgroupsAMDX is not zero, dynamic expansion nodes cannot have zero-byte payloads because the grid + // size is 12 bytes. + CrossModuleInliner inliner; + auto gprsVariable = + (m_inputPayloadInfo.payloadSize == 0 && m_enqueueModes.modes.maxNumWorkgroupsX == 0 && + m_enqueueModes.modes.maxNumWorkgroupsY == 0) + ? inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::ShaderEmptyInputPreamble], {}).returnValue + : inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::ShaderPreamble], {}).returnValue; + + // Keep the gprs variable from ShaderPreamble call + m_builder->CreateStore(gprsVariable, m_outputAllocateArgs[OutputAllocateArg::ShaderState]); + + // Create input counts number + auto inputsCount = inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::InputGetPayloadCount], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue; + m_builder->CreateStore(inputsCount, m_builtInVariables[WorkGraphBuiltIns::CoalescedInputCount]); + + auto remaining = inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::GetRemainingRecursionDepth], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue; + m_builder->CreateStore(remaining, m_builtInVariables[WorkGraphBuiltIns::RemainingRecursionLevels]); + + unsigned shaderIndex = + m_inputPayloadInfo.arrayIndex != InvalidValue ? 
m_inputPayloadInfo.arrayIndex : m_enqueueModes.modes.shaderIndex; + m_builder->CreateStore(m_builder->getInt32(shaderIndex), m_builtInVariables[WorkGraphBuiltIns::ShaderIndex]); + auto shaderMode = Pipeline::getComputeShaderMode(module); + m_threadLaunch = isThreadLaunchNode(shaderMode, m_enqueueModes, m_inputPayloadInfo); + auto zero = m_builder->getInt32(0); + auto constVec = ConstantVector::get({zero, zero, zero}); + + if (m_threadLaunch) { + auto valid = + cast(inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::IsThreadLaunchInvocationValid], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue); + auto nextPos = valid->getNextNode(); + Instruction *terminator = SplitBlockAndInsertIfElse(valid, m_builder->GetInsertPoint(), false); + m_builder->SetInsertPoint(terminator); + m_builder->CreateRetVoid(); + terminator->eraseFromParent(); + + m_builder->SetInsertPoint(nextPos); + m_localInvocationIndex = + new GlobalVariable(*m_module, zero->getType(), false, GlobalVariable::ExternalLinkage, nullptr, "localIndex", + nullptr, GlobalValue::NotThreadLocal, SPIRAS_Private); + m_builder->CreateStore(zero, m_localInvocationIndex); + shaderMode.workgroupSizeX = 32; + Pipeline::setComputeShaderMode(module, shaderMode); + } + if (m_enqueueModes.modes.isCoalescing) { + // Create WorkgroupId + m_builder->CreateStore(constVec, m_builtInVariables[WorkGraphBuiltIns::WorkgroupId]); + // Create GlobalInvocationId + Value *localInvocationId = + m_threadLaunch + ? 
constVec + : m_builder->CreateReadBuiltInInput(static_cast(lgc::BuiltInLocalInvocationId)); + m_builder->CreateStore(localInvocationId, m_builtInVariables[WorkGraphBuiltIns::GlobalInvocationId]); + + } else { + // Create WorkgroupId + auto workGroupId = inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::WorkgroupId], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue; + m_builder->CreateStore(workGroupId, m_builtInVariables[WorkGraphBuiltIns::WorkgroupId]); + + // Create GlobalInvocationId + auto globalInvocationId = inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::GlobalThreadId], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue; + m_builder->CreateStore(globalInvocationId, m_builtInVariables[WorkGraphBuiltIns::GlobalInvocationId]); + } + + SmallVector rets; + getFuncRets(m_entryPoint, rets); + for (auto ret : rets) { + m_builder->SetInsertPoint(ret); + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::ShaderPostamble], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}); + } + LoweringVisitorPayload payload(m_payloadArrayPtrType, *this); + m_typeLowering = &payload.typeLower; + static const auto visitor = llvm_dialects::VisitorBuilder() + .nest([](auto &b) { + b.add(&LowerExecutionGraph::visitLoad); + b.add(&LowerExecutionGraph::visitStore); + b.add(&LowerExecutionGraph::visitAlloca); + b.add(&LowerExecutionGraph::visitGetElementPtr); + b.add(&LowerExecutionGraph::visitIndexPayloadArray); + b.add(&LowerExecutionGraph::visitAllocateNodePayloads); + b.add(&LowerExecutionGraph::visitEnqueueNodePayloads); + b.add(&LowerExecutionGraph::visitPayloadArrayLength); + b.add(&LowerExecutionGraph::visitIsNodePayloadValid); + b.add(&LowerExecutionGraph::visitFinishWritingNodePayload); + }) + .nest(&TypeLowering::registerVisitors) + .build(); + + visitor.visit(payload, *m_module); + payload.typeLower.finishPhis(); + payload.typeLower.finishCleanup(); + m_typeLowering = nullptr; + 
buildExecGraphNodeMetadata(m_enqueueModes, m_inputPayloadInfo); + lowerGlobals(m_metaEnqueueId, m_context->getMDKindID(gSPIRVMD::InOut)); + unsigned outputCount = m_nodeNamesIdx.size(); + createGraphLds(outputCount); + // Post visit dialects after Workgraph library functions inlined + static const auto postVisitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add(&LowerExecutionGraph::visitGraphGetLds) + .add(&LowerExecutionGraph::visitOutputCount) + .build(); + postVisitor.visit(*this, *m_module); + + return PreservedAnalyses::none(); +} + +// ===================================================================================================================== +// Pre-parse to the RegisterOutputNodeOp to get number of node types/names, and setup m_nodeNamesIdx +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitRegisterOutputNode(lgc::wg::RegisterOutputNodeOp &inst) { + static const unsigned remappedScopes[3] = {WorkCreationScope::Workgroup, WorkCreationScope::Subgroup, + WorkCreationScope::Invocation}; + unsigned scope = inst.getScope(); + assert(scope == ScopeWorkgroup || scope == ScopeSubgroup || scope == ScopeInvocation); + unsigned remappedScope = remappedScopes[scope - ScopeWorkgroup]; + + auto payloadNameVar = cast(inst.getPayloadName()); + auto payloadName = cast(payloadNameVar->getInitializer())->getAsString(); + + unsigned payloadSize = inst.getPayloadSize(); + unsigned payloadMaxCount = inst.getPayloadMaxCount(); + unsigned payloadId = inst.getPayloadId(); + unsigned limitsSharedWith = inst.getLimitsSharedWith(); + bool trackFinishWriting = inst.getTrackFinishWriting(); + unsigned payloadArrayTyId = inst.getArrayTypeId(); + + auto nameIter = m_nodeNamesIdx.find(payloadName); + if (nameIter == m_nodeNamesIdx.end()) { + m_nodeNamesIdx[payloadName] = {payloadMaxCount, payloadSize, payloadId, + limitsSharedWith, remappedScope, trackFinishWriting, + inst.getArraySize(), 
payloadArrayTyId, 0}; + nameIter = m_nodeNamesIdx.find(payloadName); + } else { + // Add up the payloadMaxCount for the same output node + nameIter->second.payloadCount += payloadMaxCount; + nameIter->second.payloadSize = std::max(nameIter->second.payloadSize, payloadSize); + } + m_typeLowering->eraseInstruction(&inst); +} + +// ===================================================================================================================== +// Lower an allocate.node.payloads op +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitAllocateNodePayloads(lgc::wg::AllocateNodePayloadsOp &inst) { + m_builder->SetInsertPoint(&inst); + auto payloadNameVar = cast(inst.getPayloadName()); + auto payloadName = cast(payloadNameVar->getInitializer())->getAsString(); + auto nameIter = m_nodeNamesIdx.find(payloadName); + + auto baseIndex = inst.getBaseIndex(); + + m_builder->CreateStore(m_builder->getInt32(nameIter->second.scope), m_outputAllocateArgs[OutputAllocateArg::Scope]); + + // MapVector will keep the index of insertion, so the OutputIndex would be index of Output payload nodes names array. + // Each array member must have a unique node name, array index is the specific shader in that array. + auto OutputIndex = nameIter - m_nodeNamesIdx.begin(); + m_builder->CreateStore(m_builder->getInt32(OutputIndex), m_outputAllocateArgs[OutputAllocateArg::OutputIdx]); + + m_builder->CreateStore(inst.getPayloadCount(), m_outputAllocateArgs[OutputAllocateArg::Count]); + + Value *nodeIdx = inst.getNodeIndex(); + nodeIdx = m_builder->CreateAdd(nodeIdx, baseIndex); + const bool recursiveNode = + (m_enqueueModes.modes.maxNodeRecursion > 0) && (payloadName == m_inputPayloadInfo.nodeName); + if (recursiveNode) { + // NOTE: Always needs to be 0 for recursive calls since recursive output ports always have an + // array index offset equal to the parent. 
No need to check the array index provided by the + // app since the only legal case is self-recursion (the node calling itself, same name, same index). + nodeIdx = m_builder->getInt32(0); + } + m_builder->CreateStore(nodeIdx, m_outputAllocateArgs[OutputAllocateArg::ArrayIdx]); + + // Call OutputAllocate + SmallVector args; + for (auto arg : m_outputAllocateArgs) { + args.push_back(arg); + } + CrossModuleInliner inliner; + Value *outputRecords = nullptr; + if (nameIter->second.payloadSize == 0) { + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::IncrementEmptyOutputCount], args); + outputRecords = PoisonValue::get(getOutputRecordsTy()); + } else { + outputRecords = inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::OutputAllocate], args).returnValue; + } + auto dummyValue = ConstantPointerNull::get(PointerType::get(*m_context, SPIRAS_Private)); + m_typeLowering->replaceInstruction(&inst, {dummyValue, outputRecords}); +} + +// ===================================================================================================================== +// Lower an enqueue.node.payloads op +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitEnqueueNodePayloads(lgc::wg::EnqueueNodePayloadsOp &inst) { + m_builder->SetInsertPoint(&inst); + auto *payloadArrayPtr = m_typeLowering->getValue(inst.getPayloads())[0]; + auto payloadNameVar = cast(inst.getPayloadName()); + auto payloadName = cast(payloadNameVar->getInitializer())->getAsString(); + + auto nameIter = m_nodeNamesIdx.find(payloadName); + assert(nameIter != m_nodeNamesIdx.end()); + m_builder->CreateStore(m_builder->getInt32(nameIter->second.scope), m_outputAllocateArgs[OutputAllocateArg::Scope]); + + SmallVector args = {m_outputAllocateArgs[OutputAllocateArg::ShaderState], + m_outputAllocateArgs[OutputAllocateArg::Scope], payloadArrayPtr}; + CrossModuleInliner inliner; + if (nameIter->second.trackFinishWriting) { + inliner.inlineCall(*m_builder, 
m_graphLibFuncs[WorkGraphFunc::InitCrossGroupSharing], args); + } + + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::OutputCommit], args); + + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// ===================================================================================================================== +// Lower a finish.writing.node.payload op +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitFinishWritingNodePayload(wg::FinishWritingNodePayloadOp &inst) { + m_builder->SetInsertPoint(&inst); + CrossModuleInliner inliner; + inst.replaceAllUsesWith(inliner + .inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::FinishCrossGroupSharing], + {m_outputAllocateArgs[OutputAllocateArg::ShaderState]}) + .returnValue); + + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// ===================================================================================================================== +// Lower a payload.array.length op +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitPayloadArrayLength(wg::PayloadArrayLengthOp &inst) { + m_builder->SetInsertPoint(&inst); + Value *nodeCount = nullptr; + if (inst.getInput()) { + nodeCount = + m_builder->CreateLoad(m_builder->getInt32Ty(), m_builtInVariables[WorkGraphBuiltIns::CoalescedInputCount]); + } else { + // Output variable + auto *payloadArrayPtr = m_typeLowering->getValue(inst.getPayloads())[0]; + Value *args[] = {m_builder->getInt32(0), m_builder->getInt32(4)}; + nodeCount = m_builder->CreateGEP(m_payloadArrayPtrType, payloadArrayPtr, args); + nodeCount = m_builder->CreateLoad(m_builder->getInt32Ty(), nodeCount); + } + inst.replaceAllUsesWith(nodeCount); + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// 
===================================================================================================================== +// Lower a LoadInst instruction +// +// @param inst : the instruction to lower +VisitorResult LowerExecutionGraph::visitLoad(LoadInst &inst) { + m_builder->SetInsertPoint(&inst); + if (inst.getPointerOperandType()->getPointerAddressSpace() == SPIRAS_PayloadArray) { + Value *outputRecord = m_typeLowering->getValue(inst.getPointerOperand())[1]; + m_typeLowering->replaceInstruction(&inst, outputRecord); + } + return VisitorResult::Stop; +} + +// ===================================================================================================================== +// Lower a StoreInst instruction +// +// @param inst : the instruction to lower +VisitorResult LowerExecutionGraph::visitStore(StoreInst &inst) { + m_builder->SetInsertPoint(&inst); + if (inst.getPointerOperandType()->getPointerAddressSpace() == SPIRAS_PayloadArray) { + auto ptrOperand = inst.getPointerOperand(); + Value *newPtrOperand = m_typeLowering->getValue(ptrOperand)[0]; + Value *newVal = m_typeLowering->getValue(inst.getValueOperand())[0]; + m_builder->CreateStore(newVal, newPtrOperand, inst.isVolatile()); + m_typeLowering->eraseInstruction(&inst); + } + return VisitorResult::Stop; +} + +// ===================================================================================================================== +// Lower an AllocInst +// +// @param inst : the instruction to lower +VisitorResult LowerExecutionGraph::visitAlloca(AllocaInst &inst) { + m_builder->SetInsertPoint(&inst); + if (inst.getAddressSpace() == SPIRAS_PayloadArray) { + Type *allocTy = replacePayloadType(inst.getAllocatedType()); + auto newAlloc = m_builder->CreateAlloca(allocTy); + auto dummyValue = PoisonValue::get(m_payloadArrayPtrType); + m_typeLowering->replaceInstruction(&inst, {newAlloc, dummyValue}); + } + return VisitorResult::Stop; +} + +// 
===================================================================================================================== +// Lower a GetElementPtrInst +// +// @param inst : the instruction to lower +VisitorResult LowerExecutionGraph::visitGetElementPtr(GetElementPtrInst &inst) { + m_builder->SetInsertPoint(&inst); + if (inst.getAddressSpace() == SPIRAS_PayloadArray) { + Type *gepTy = replacePayloadType(inst.getSourceElementType()); + Value *srcElement = m_typeLowering->getValue(inst.getPointerOperand())[0]; + Value *newGep = nullptr; + SmallVector indices(inst.idx_begin(), inst.idx_end()); + if (inst.isInBounds()) + newGep = m_builder->CreateInBoundsGEP(gepTy, srcElement, indices); + else + newGep = m_builder->CreateGEP(gepTy, srcElement, indices); + auto dummyValue = PoisonValue::get(m_payloadArrayPtrType); + m_typeLowering->replaceInstruction(&inst, {newGep, dummyValue}); + } + return VisitorResult::Stop; +} + +// ===================================================================================================================== +// Recursive replace {} to the OutputRecordType in the aggregation type +// +// @param ty : The type to replace +Type *LowerExecutionGraph::replacePayloadType(Type *ty) { + if (isPayloadType(ty)) { + return m_payloadArrayPtrType; + } else if (ty->isStructTy()) { + SmallVector elemTys; + for (unsigned i = 0; i < ty->getStructNumElements(); ++i) + elemTys.push_back(replacePayloadType(ty->getStructElementType(i))); + return StructType::get(*m_context, elemTys); + } else if (ty->isArrayTy()) { + return ArrayType::get(replacePayloadType(ty->getArrayElementType()), ty->getArrayNumElements()); + } else + return ty; +} + +// ===================================================================================================================== +// Lower an is.node.payload.valid +// +// @param inst : the instruction to lower +void LowerExecutionGraph::visitIsNodePayloadValid(wg::IsNodePayloadValidOp &inst) { + m_builder->SetInsertPoint(&inst); + auto 
payloadNameVar = cast(inst.getPayloadName()); + auto payloadName = cast(payloadNameVar->getInitializer())->getAsString(); + auto nameIter = m_nodeNamesIdx.find(payloadName); + auto OutputIndex = nameIter - m_nodeNamesIdx.begin(); + m_builder->CreateStore(m_builder->getInt32(OutputIndex), m_outputAllocateArgs[OutputAllocateArg::OutputIdx]); + m_builder->CreateStore(inst.getNodeIndex(), m_outputAllocateArgs[OutputAllocateArg::ArrayIdx]); + Value *args[] = {m_outputAllocateArgs[OutputAllocateArg::ShaderState], + m_outputAllocateArgs[OutputAllocateArg::OutputIdx], + m_outputAllocateArgs[OutputAllocateArg::ArrayIdx]}; + CrossModuleInliner inliner; + Value *isValid = + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::IsOutputNodePresent], args).returnValue; + inst.replaceAllUsesWith(isValid); + + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// ===================================================================================================================== +// Create global variables +// +// @param builder : The builder to create variable +void LowerExecutionGraph::initAllocVariables(lgc::Builder *builder) { + Type *tys[] = {getShaderStateTy(), m_builder->getInt32Ty(), m_builder->getInt32Ty(), m_builder->getInt32Ty(), + m_builder->getInt32Ty()}; + + for (unsigned i = 0; i < m_outputAllocateArgs.size(); ++i) { + m_outputAllocateArgs[i] = m_builder->CreateAlloca(tys[i], nullptr, Twine(OutputArgNames[i])); + } + m_tempVariable = m_builder->CreateAlloca(m_builder->getInt32Ty(), nullptr, Twine("tempVariable")); + auto int32x3Ty = FixedVectorType::get(m_builder->getInt32Ty(), 3); + Type *builtInTys[] = {m_builder->getInt32Ty(), int32x3Ty, int32x3Ty, m_builder->getInt32Ty(), m_builder->getInt32Ty(), + m_builder->getInt32Ty()}; + + for (unsigned i = 0; i < WorkGraphBuiltIns::Count; ++i) { + m_builtInVariables[i] = + new GlobalVariable(*m_module, builtInTys[i], false, GlobalVariable::ExternalLinkage, nullptr, + 
Twine("builtIn") + std::to_string(i), nullptr, GlobalValue::NotThreadLocal, SPIRAS_Private); + } +} + +// ===================================================================================================================== +// Get AmdWorkGraphsShaderState type +Type *LowerExecutionGraph::getShaderStateTy() { + return m_graphLibFuncs[WorkGraphFunc::ShaderPreamble]->getReturnType(); +} + +// ===================================================================================================================== +// Get OutputRecords type +Type *LowerExecutionGraph::getOutputRecordsTy() { + return m_graphLibFuncs[WorkGraphFunc::OutputAllocate]->getReturnType(); +} + +// ===================================================================================================================== +// Get all the function ReturnInst +// +// @param func : Function to gather ReturnInst +// @param rets : Returned vector of ReturnInst instructions +void LowerExecutionGraph::getFuncRets(Function *func, SmallVector &rets) { + for (auto &block : *func) { + auto blockTerm = block.getTerminator(); + if (blockTerm != nullptr && isa(blockTerm)) + rets.push_back(blockTerm); + } +} + +// ===================================================================================================================== +// Lower the builtin and workgraph global variables +// +// @param enqueueMetaId : Metadata for the workgraph variables +// @param inoutMetaId : Metadata for the built-in variables +void LowerExecutionGraph::lowerGlobals(unsigned enqueueMetaId, unsigned inoutMetaId) { + for (Function *func : m_funcsToLower) { + func->dropAllReferences(); + func->eraseFromParent(); + } + + SmallVector geps; + for (auto globalIt = m_module->global_begin(), end = m_module->global_end(); globalIt != end;) { + GlobalVariable *global = &*globalIt++; + auto meta = global->getMetadata(enqueueMetaId); + if (meta != nullptr) { + global->eraseFromParent(); + } else if ((meta = global->getMetadata(inoutMetaId)) != nullptr) 
{ + processBuiltinGlobals(global, meta); + } + } +} + +// ===================================================================================================================== +// Lower the built-in global variables +// +// @param global : Global variables to lower +// @param metadata : Metadata for the built-in variables +void LowerExecutionGraph::processBuiltinGlobals(GlobalVariable *global, MDNode *metadata) { + auto meta = mdconst::dyn_extract(metadata->getOperand(0)); + unsigned startOperand = 0; + Type *globalTy = global->getValueType(); + if (globalTy->isArrayTy()) { + assert(meta->getNumOperands() == 4); + startOperand += 2; + } + ShaderInOutMetadata inputMeta = {}; + inputMeta.U64All[0] = cast(meta->getOperand(startOperand))->getZExtValue(); + inputMeta.U64All[1] = cast(meta->getOperand(startOperand + 1))->getZExtValue(); + llvm::GlobalVariable *replacement = nullptr; + switch (inputMeta.Value) { + case spv::BuiltInWorkgroupId: + replacement = m_builtInVariables[WorkGraphBuiltIns::WorkgroupId]; + break; + case spv::BuiltInGlobalInvocationId: + replacement = m_builtInVariables[WorkGraphBuiltIns::GlobalInvocationId]; + break; + case spv::BuiltInLocalInvocationId: + case spv::BuiltInLocalInvocationIndex: { + if (!m_threadLaunch) + return; + replacement = inputMeta.Value == spv::BuiltInLocalInvocationId + ? 
m_builtInVariables[WorkGraphBuiltIns::GlobalInvocationId] + : m_localInvocationIndex; + break; + } + case spv::BuiltInShaderIndexAMDX: + replacement = m_builtInVariables[WorkGraphBuiltIns::ShaderIndex]; + break; + case spv::BuiltInRemainingRecursionLevelsAMDX: { + replacement = m_builtInVariables[WorkGraphBuiltIns::RemainingRecursionLevels]; + } break; + default: + // For other builtin Globals, return + return; + } + global->mutateType(replacement->getType()); + replaceGlobal(m_context, global, replacement); +} + +// ===================================================================================================================== +// Fill m_inputPayloadInfo with payload metadata and ShaderEnqueue mode +// +// @param enqueueModes : Workgraph shader enqueue modes +void LowerExecutionGraph::initInputPayloadInfo(const lgc::wg::ShaderEnqueueMode &enqueueModes) { + m_inputPayloadInfo = {"", InvalidValue, 0, 0, false, InvalidValue, InvalidValue, InvalidValue, InvalidValue}; + auto moduleMetadata = m_module->getNamedMetadata(lgc::wg::ShaderEnqueue); + MDNode *payloadMeta = moduleMetadata->getOperand(moduleMetadata->getNumOperands() - 1); + m_inputPayloadInfo.nodeName = cast(payloadMeta->getOperand(0))->getString(); + auto arrayIndexMeta = cast(payloadMeta->getOperand(1)); + m_inputPayloadInfo.arrayIndex = cast(arrayIndexMeta->getValue())->getZExtValue(); + + if (moduleMetadata->getNumOperands() > 1) { + payloadMeta = moduleMetadata->getOperand(0); + auto maxPayloadMeta = cast(payloadMeta->getOperand(0)); + m_inputPayloadInfo.payloadCount = cast(maxPayloadMeta->getValue())->getZExtValue(); + auto payloadSizeMeta = cast(payloadMeta->getOperand(1)); + m_inputPayloadInfo.payloadSize = cast(payloadSizeMeta->getValue())->getZExtValue(); + auto trackFinishWritingMeta = cast(payloadMeta->getOperand(2)); + m_inputPayloadInfo.trackFinishWriting = cast(trackFinishWritingMeta->getValue())->isOne(); + auto dynamicDispatchMeta = cast(payloadMeta->getOperand(3)); + 
m_inputPayloadInfo.dynamicDispatch = cast(dynamicDispatchMeta->getValue())->getZExtValue(); + auto nodeTypeMeta = cast(payloadMeta->getOperand(4)); + m_inputPayloadInfo.nodeType = cast(nodeTypeMeta->getValue())->getZExtValue(); + auto vbTableOffsetMeta = cast(payloadMeta->getOperand(5)); + m_inputPayloadInfo.vbTableOffset = cast(vbTableOffsetMeta->getValue())->getZExtValue(); + auto indexBufferOffsetMeta = cast(payloadMeta->getOperand(6)); + m_inputPayloadInfo.indexBufferOffset = cast(indexBufferOffsetMeta->getValue())->getZExtValue(); + } +} + +// ===================================================================================================================== +// Build the ExecutionGraph PAL metadata +// +// @param enqueueModes : ShaderEnqueueMode mode +// @param payloads : Payload size and count +void LowerExecutionGraph::buildExecGraphNodeMetadata(const ShaderEnqueueMode &enqueueModes, + const InputPayloadInfo &payloads) { + + lgc::GraphNodeMetadata graphNodeMeta = {}; + graphNodeMeta.payloadMaxCount = payloads.payloadCount; + graphNodeMeta.payloadSize = payloads.payloadSize; + graphNodeMeta.maxRecursionDepth = enqueueModes.modes.maxNodeRecursion; + graphNodeMeta.node.name = payloads.nodeName; + graphNodeMeta.node.arrayIndex = + payloads.arrayIndex != InvalidValue ? 
payloads.arrayIndex : enqueueModes.modes.shaderIndex; + graphNodeMeta.inputSharedWith.name = m_inputSharedWithName; + graphNodeMeta.inputSharedWith.arrayIndex = enqueueModes.modes.inputSharedWithArrayIndex; + graphNodeMeta.payloadFlags.crossGroupSharing = payloads.trackFinishWriting; + + if (payloads.dynamicDispatch != InvalidValue) { + graphNodeMeta.dynamicDispatchGrid.componentCount = payloads.dynamicDispatch >> 24; + graphNodeMeta.dynamicDispatchGrid.bitsPerComponent = (payloads.dynamicDispatch >> 16) & 0xff; + graphNodeMeta.dynamicDispatchGrid.offset = payloads.dynamicDispatch & 0xffff; + } else { + graphNodeMeta.dynamicDispatchGrid.componentCount = 3; + graphNodeMeta.dynamicDispatchGrid.bitsPerComponent = (sizeof(unsigned) << 3); + graphNodeMeta.dynamicDispatchGrid.offset = 0; + } + + graphNodeMeta.outputs.resize(m_nodeNamesIdx.size()); + unsigned outIdx = 0; + for (auto &nodeName : m_nodeNamesIdx) { + NodeShaderOutputInfo &outputInfo = graphNodeMeta.outputs[outIdx++]; + + bool recursiveNode = (enqueueModes.modes.maxNodeRecursion > 0) && (nodeName.first == graphNodeMeta.node.name); + outputInfo.node.arrayIndex = recursiveNode ? graphNodeMeta.node.arrayIndex : 0; + outputInfo.arrayCount = recursiveNode ? 1 : UINT_MAX; + // NOTE: It is a workaround of test issue; revisit once the spec has been updated + outputInfo.payloadMaxCount = std::min(nodeName.second.payloadCount, 256u); + outputInfo.payloadSize = nodeName.second.payloadSize; + outputInfo.payloadFlags.crossGroupSharing = nodeName.second.trackFinishWriting; + // Copy name + outputInfo.node.name = nodeName.first.str(); + + bool validPayloadIdToShare = nodeName.second.limitSharedWith != InvalidValue; + outputInfo.budgetSharedWith.enable = validPayloadIdToShare; + outputInfo.budgetSharedWith.index = validPayloadIdToShare ? 
getOutputIndex(nodeName.second.limitSharedWith) : 0; + } + + // Determine the graph node type + // If static dispatch size is provided -> Fixed expansion + // If coalescing mode is provided -> Coalescing + // Otherwise -> Dynamic expansion + if (enqueueModes.modes.staticNumWorkgroupsX != 0) { + assert(enqueueModes.modes.staticNumWorkgroupsX != 0 && enqueueModes.modes.staticNumWorkgroupsY != 0 && + enqueueModes.modes.staticNumWorkgroupsZ != 0); + assert(enqueueModes.modes.maxNumWorkgroupsX == 0 && enqueueModes.modes.maxNumWorkgroupsY == 0 && + enqueueModes.modes.maxNumWorkgroupsZ == 0); + assert(enqueueModes.modes.isCoalescing == false); + graphNodeMeta.nodeType = GraphNodeTypeFixedExpansion; + + graphNodeMeta.dispatchGridX = enqueueModes.modes.staticNumWorkgroupsX; + graphNodeMeta.dispatchGridY = enqueueModes.modes.staticNumWorkgroupsY; + graphNodeMeta.dispatchGridZ = enqueueModes.modes.staticNumWorkgroupsZ; + } else if (enqueueModes.modes.isCoalescing) { + assert(enqueueModes.modes.staticNumWorkgroupsX == 0 && enqueueModes.modes.staticNumWorkgroupsY == 0 && + enqueueModes.modes.staticNumWorkgroupsZ == 0); + assert(enqueueModes.modes.maxNumWorkgroupsX == 0 && enqueueModes.modes.maxNumWorkgroupsY == 0 && + enqueueModes.modes.maxNumWorkgroupsZ == 0); + graphNodeMeta.nodeType = m_threadLaunch ? 
GraphNodeTypeThreadLaunch : GraphNodeTypeCoalescing; + } else { + assert(enqueueModes.modes.staticNumWorkgroupsX == 0 && enqueueModes.modes.staticNumWorkgroupsY == 0 && + enqueueModes.modes.staticNumWorkgroupsZ == 0); + assert(enqueueModes.modes.isCoalescing == false); + graphNodeMeta.nodeType = GraphNodeTypeDynamicExpansion; + graphNodeMeta.dispatchGridX = enqueueModes.modes.maxNumWorkgroupsX; + graphNodeMeta.dispatchGridY = enqueueModes.modes.maxNumWorkgroupsY; + graphNodeMeta.dispatchGridZ = enqueueModes.modes.maxNumWorkgroupsZ; + // Payload not explicitly declared, but it must exist and contain at least the dispatch size + if (graphNodeMeta.payloadSize == 0) { + graphNodeMeta.payloadSize = 12; + graphNodeMeta.payloadMaxCount = 1; + } + + // The shader didn't provide MaxNumWorkgroupsAMDX, fall back to the max limit + if (graphNodeMeta.dispatchGridX == 0) { + graphNodeMeta.dispatchGridX = MaxGridCount; + graphNodeMeta.dispatchGridY = MaxGridCount; + graphNodeMeta.dispatchGridZ = MaxGridCount; + } + } + + // Affects PatchPreparePipelineAbi::setAbiEntryNames() for compute shaders. + m_pipeline->setGraphMetadata(graphNodeMeta); +} + +// ===================================================================================================================== +// Get output node index + +// @param payloadId : Output payload id +unsigned LowerExecutionGraph::getOutputIndex(unsigned id) { + unsigned outIdx = 0; + for (auto &nodeName : m_nodeNamesIdx) { + // The SPIR-V spec expects the decoration to refer to an array type's id. + // String name's id is a fallback for glslang compatibility. 
+ if ((nodeName.second.arrayTypeId == id) || (nodeName.second.payloadId == id)) + return outIdx; + outIdx++; + } + llvm_unreachable("Should find payloadId"); + return outIdx; +} + +// ===================================================================================================================== +// Lower dialect IndexPayloadArrayOp +// +// @param [in] inst : IndexPayloadArrayOp to lower +void LowerExecutionGraph::visitIndexPayloadArray(lgc::wg::IndexPayloadArrayOp &inst) { + m_builder->SetInsertPoint(&inst); + CrossModuleInliner inliner; + Value *payloadAddr = nullptr; + bool isInput = cast(inst.getInput())->isOne(); + if (isInput) { + Value *indexValue = + m_threadLaunch ? m_builder->CreateReadBuiltInInput(lgc::BuiltInLocalInvocationIndex) : inst.getIndex(); + m_builder->CreateStore(indexValue, m_tempVariable); + + Value *args[] = {m_outputAllocateArgs[OutputAllocateArg::ShaderState], m_tempVariable}; + payloadAddr = + inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::InputGetPayloadAtIndex], args).returnValue; + } else { + m_builder->CreateStore(inst.getIndex(), m_tempVariable); + auto payloadArray = m_typeLowering->getValue(inst.getPayloadArray())[0]; + Value *args[] = {payloadArray, m_tempVariable}; + payloadAddr = inliner.inlineCall(*m_builder, m_graphLibFuncs[WorkGraphFunc::OutputGetPayload], args).returnValue; + } + payloadAddr = m_builder->CreateIntToPtr(payloadAddr, PointerType::get(*m_context, SPIRAS_Global)); + // TODO: currently recursive set GEP chain load/store as volatile to make payload access + // coherent, aka, load glc/dlc. 
+ // correctly represent memory model semantics once backend is ready + std::function setLoadStore = [&](Value *nodearray) { + for (Use &use : nodearray->uses()) { + Instruction *chainedUser = cast(use.getUser()); + if (auto loadInst = dyn_cast(chainedUser)) { + loadInst->setVolatile(true); + } else if (auto storeInst = dyn_cast(chainedUser)) { + storeInst->setVolatile(true); + } else { + auto gepInst = cast(chainedUser); + gepInst->mutateType(nodearray->getType()); + setLoadStore(gepInst); + } + } + }; + + setLoadStore(&inst); + inst.replaceAllUsesWith(payloadAddr); + m_typeLowering->eraseInstruction(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +// ===================================================================================================================== +// Get input payload + +// @param enqueueMetaId : shader enqueue metadata ID +GlobalVariable *LowerExecutionGraph::getInputPayload(unsigned enqueueMetaId) { + for (auto &global : m_module->globals()) { + if (global.getMetadata(enqueueMetaId)) { + return &global; + } + } + return nullptr; +} + +// ===================================================================================================================== +// Is thread node + +// @param shaderMode : compute shader mode +// @param enqueueModes : enqueue mode +// @param payload : payload +bool LowerExecutionGraph::isThreadLaunchNode(const lgc::ComputeShaderMode &shaderMode, + const ShaderEnqueueMode &enqueueModes, const InputPayloadInfo &payloads) { + + // Workgroup size is 1, 1, 1 + bool threadLaunch = shaderMode.workgroupSizeX == 1; + threadLaunch = threadLaunch && (shaderMode.workgroupSizeY == 1); + threadLaunch = threadLaunch && (shaderMode.workgroupSizeZ == 1); + // Must be coalescing node. 
+ threadLaunch = threadLaunch && enqueueModes.modes.isCoalescing; + // If there is input payload, then input payload count is 1 + threadLaunch = threadLaunch && (payloads.payloadCount <= 1); + + // Less than 8 allocation nodes + threadLaunch = threadLaunch && (m_nodeNamesIdx.size() < 8); + // Compute shader does not use lds + for (auto &global : m_module->globals()) { + if (global.getAddressSpace() == SPIRAS_Local) { + threadLaunch = false; + break; + } + } + return threadLaunch; +} + +// ===================================================================================================================== +// Create Lds memory for the output graph nodes + +// @param outputCount : Number of node output +void LowerExecutionGraph::createGraphLds(unsigned outputCount) { + if (m_graphLds == nullptr) { + // - base_wptr_transfer + // - last_group_transfer + // - allocation_counts[num_outputs] + auto ldsSize = outputCount + 2; + auto ldsTy = ArrayType::get(m_builder->getInt32Ty(), ldsSize); + m_graphLds = new GlobalVariable(*m_module, ldsTy, false, GlobalValue::ExternalLinkage, nullptr, "GraphLds", nullptr, + GlobalValue::NotThreadLocal, SPIRAS_Local); + } +} + +// ===================================================================================================================== +// Create OutputCountOp used for the execution graph library +// +// @param [in] inst : OutputCountOp to lower +void LowerExecutionGraph::visitOutputCount(wg::OutputCountOp &inst) { + m_builder->SetInsertPoint(&inst); + auto outputCount = m_builder->getInt32(m_nodeNamesIdx.size()); + inst.replaceAllUsesWith(outputCount); +} + +// ===================================================================================================================== +// Visit GraphgetLdsOp used for the execution graph library +// +// @param [in] inst : GraphGetLdsOp to lower +void LowerExecutionGraph::visitGraphGetLds(wg::GraphGetLdsOp &inst) { + auto retTy = PointerType::get(m_builder->getInt32Ty(), SPIRAS_Local); + 
m_builder->SetInsertPoint(&inst); + assert(m_graphLds != nullptr); + auto ldsPtr = m_builder->CreateGEP(m_builder->getInt32Ty(), m_graphLds, m_builder->getInt32(0)); + ldsPtr = m_builder->CreateBitCast(ldsPtr, retTy); + inst.replaceAllUsesWith(ldsPtr); +} + +} // namespace Llpc diff --git a/llpc/lowering/LowerExecutionGraph.h b/llpc/lowering/LowerExecutionGraph.h new file mode 100644 index 0000000000..6ba637385b --- /dev/null +++ b/llpc/lowering/LowerExecutionGraph.h @@ -0,0 +1,157 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerExecutionGraph.h + * @brief LLPC header file: contains declaration of Llpc::LowerExecutionGraph + *********************************************************************************************************************** + */ +#pragma once + +#include "Lowering.h" +#include "SPIRVInternal.h" +#include "lgc/LgcWgDialect.h" +#include "llvm-dialects/Dialect/Visitor.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/PassManager.h" + +namespace CompilerUtils { +class TypeLowering; +} // namespace CompilerUtils + +namespace lgc { +class Pipeline; +struct ComputeShaderMode; +} // namespace lgc + +namespace Llpc { + +namespace WorkCreationScope { +enum : unsigned { + Invocation = 0, // WorkCreation library invocation scope + Workgroup = 1, // WorkCreation library workgroup scope + Subgroup = 2 // WorkCreation library subgroup scope +}; +} + +namespace WorkGraphBuiltIns { +enum : unsigned { + CoalescedInputCount = 0, // SPIRV CoalescedInputCount + WorkgroupId, // SPIRV WorkgroupId + GlobalInvocationId, // SPIRV GlobalInvocationId + ShaderIndex, // SPIRV ShaderIndex + RemainingRecursionLevels, // SPIRV RemainingRecursionLevels + LocalInvocationIndex, // SPIRV GlobalInvocationId + Count +}; +} + +namespace OutputAllocateArg { +enum : unsigned { ShaderState = 0, Scope, OutputIdx, ArrayIdx, Count }; +} + +// ===================================================================================================================== +// Represents the pass of SPIR-V lowering shader enqueue opcode +class LowerExecutionGraph : public SpirvLower, public llvm::PassInfoMixin { + + struct OutputPayloadInfo { + unsigned payloadCount; // Payload Count + unsigned payloadSize; // Payload Size + 
unsigned payloadId; // Payload id + unsigned limitSharedWith; // payload id to share with limit + unsigned scope; // created scope + bool trackFinishWriting; // Whether this payload need to track finish writing + unsigned arraySize; // Payload array size + unsigned arrayTypeId; // Payload array type's id + unsigned dynamicDispatch; // DynamicDispatch; + }; + + struct InputPayloadInfo { + llvm::StringRef nodeName; // node name + unsigned arrayIndex; // array Index + unsigned payloadCount; // Payload Count + unsigned payloadSize; // Payload Size + bool trackFinishWriting; // Track finish + unsigned dynamicDispatch; // DynamicDispatch + unsigned nodeType; // Node type + unsigned vbTableOffset; // vertex buffer table offset + unsigned indexBufferOffset; // index buffer table offset + }; + +public: + LowerExecutionGraph(lgc::Pipeline *pipeline); + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + static llvm::StringRef name() { return "Lower SPIR-V execution graph node shader"; } + +private: + void initAllocVariables(lgc::Builder *builder); + typedef void (LowerExecutionGraph::*LibraryFuncPtr)(llvm::Function *, unsigned); + llvm::Type *getShaderStateTy(); + llvm::Type *getOutputRecordsTy(); + void getFuncRets(llvm::Function *func, llvm::SmallVector &rets); + void lowerGlobals(unsigned enqueueMetaId, unsigned inoutMetaId); + void processBuiltinGlobals(llvm::GlobalVariable *global, llvm::MDNode *mdata); + void buildExecGraphNodeMetadata(const lgc::wg::ShaderEnqueueMode &enqueueModes, const InputPayloadInfo &payloads); + void initInputPayloadInfo(const lgc::wg::ShaderEnqueueMode &enqueueModes); + llvm::GlobalVariable *getInputPayload(unsigned enqueueMetaId); + void createGraphLds(unsigned outputCount); + unsigned getOutputIndex(unsigned payloadId); + void visitIndexPayloadArray(lgc::wg::IndexPayloadArrayOp &inst); + void visitAllocateNodePayloads(lgc::wg::AllocateNodePayloadsOp &inst); + void 
visitRegisterOutputNode(lgc::wg::RegisterOutputNodeOp &inst); + void visitEnqueueNodePayloads(lgc::wg::EnqueueNodePayloadsOp &inst); + void visitPayloadArrayLength(lgc::wg::PayloadArrayLengthOp &inst); + void visitIsNodePayloadValid(lgc::wg::IsNodePayloadValidOp &inst); + void visitFinishWritingNodePayload(lgc::wg::FinishWritingNodePayloadOp &inst); + void visitGraphGetLds(lgc::wg::GraphGetLdsOp &inst); + void visitOutputCount(lgc::wg::OutputCountOp &inst); + llvm_dialects::VisitorResult visitLoad(LoadInst &load); + llvm_dialects::VisitorResult visitAlloca(AllocaInst &alloca); + llvm_dialects::VisitorResult visitStore(StoreInst &store); + llvm_dialects::VisitorResult visitGetElementPtr(GetElementPtrInst &gep); + Type *replacePayloadType(Type *ty); + bool isThreadLaunchNode(const lgc::ComputeShaderMode &shaderMode, const lgc::wg::ShaderEnqueueMode &enqueueModes, + const InputPayloadInfo &payloads); + std::array m_outputAllocateArgs; + llvm::Value *m_tempVariable; + llvm::GlobalVariable *m_localInvocationIndex; // Built-in variable + llvm::GlobalVariable *m_builtInVariables[WorkGraphBuiltIns::Count]; // Built-in variable + llvm::SmallSet m_funcsToLower; // Function to lower + llvm::MapVector m_nodeNamesIdx; // Node names + llvm::DenseMap m_workGraphLibFuncNames; // Workgraph library functions names + llvm::SmallVector m_graphLibFuncs; // Workgraph library + llvm::Type *m_payloadArrayPtrType = nullptr; + CompilerUtils::TypeLowering *m_typeLowering = nullptr; + lgc::wg::ShaderEnqueueMode m_enqueueModes; + std::string m_inputSharedWithName; + + unsigned m_metaEnqueueId; // Shader enqueue meta id + lgc::Pipeline *m_pipeline; // Pipeline State + InputPayloadInfo m_inputPayloadInfo; // Input payload info + llvm::GlobalVariable *m_graphLds; // Graph Lds variable + bool m_threadLaunch; // Enable ThreadLaunch mode or not +}; +} // namespace Llpc diff --git a/llpc/lower/LowerGLCompatibility.cpp b/llpc/lowering/LowerGlCompatibility.cpp similarity index 95% rename from 
llpc/lower/LowerGLCompatibility.cpp rename to llpc/lowering/LowerGlCompatibility.cpp index a10f5bcd36..179812a755 100644 --- a/llpc/lower/LowerGLCompatibility.cpp +++ b/llpc/lowering/LowerGlCompatibility.cpp @@ -24,20 +24,20 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file LowerGLCompatibility.cpp - * @brief LLPC source file: contains implementation of class Llpc::LowerGLCompatibility. + * @file LowerGlCompatibility.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerGlCompatibility. *********************************************************************************************************************** */ -#include "LowerGLCompatibility.h" +#include "LowerGlCompatibility.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcGraphicsContext.h" -#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" #include "lgc/Pipeline.h" #include "llvm/IR/DerivedTypes.h" -#define DEBUG_TYPE "llpc-spirv-lower-gl-compatibility" +#define DEBUG_TYPE "lower-gl-compatibility" using namespace llvm; using namespace Llpc; @@ -45,7 +45,7 @@ using namespace Llpc; namespace Llpc { // ===================================================================================================================== -LowerGLCompatibility::LowerGLCompatibility() +LowerGlCompatibility::LowerGlCompatibility() : m_retInst(nullptr), m_entryPointEnd(nullptr), m_originalEntryBlock(nullptr), m_out(nullptr), m_clipVertex(nullptr), m_clipDistance(nullptr), m_clipPlane(nullptr), m_frontColor(nullptr), m_backColor(nullptr), m_frontSecondaryColor(nullptr), m_backSecondaryColor(nullptr), m_color(nullptr), m_secondaryColor(nullptr), @@ -57,9 +57,9 @@ LowerGLCompatibility::LowerGLCompatibility() // // @param [in/out] module : LLVM module to be run on // 
@param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses LowerGLCompatibility::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerGlCompatibility::run(Module &module, ModuleAnalysisManager &analysisManager) { SpirvLower::init(&module); - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-gl-compatibility\n"); + LLVM_DEBUG(dbgs() << "Run the pass Lower-gl-compatibility\n"); if (!needRun()) return PreservedAnalyses::all(); @@ -113,7 +113,7 @@ PreservedAnalyses LowerGLCompatibility::run(Module &module, ModuleAnalysisManage // ===================================================================================================================== // Use to check whether need run the pass. -bool LowerGLCompatibility::needRun() { +bool LowerGlCompatibility::needRun() { bool result = false; if (m_context->getPipelineType() == PipelineType::Graphics) { auto moduleData = @@ -146,7 +146,7 @@ bool LowerGLCompatibility::needRun() { // Get location in meta data, if the global variable is UniformConstant. // // @param [in] var : Global variable to get uniform constant location -unsigned LowerGLCompatibility::getUniformLocation(llvm::GlobalVariable *var) { +unsigned LowerGlCompatibility::getUniformLocation(llvm::GlobalVariable *var) { assert(var->getType()->getAddressSpace() == SPIRAS_Uniform && var->hasMetadata(gSPIRVMD::UniformConstant)); MDNode *metaNode = var->getMetadata(gSPIRVMD::UniformConstant); return mdconst::extract(metaNode->getOperand(3))->getZExtValue(); @@ -159,7 +159,7 @@ unsigned LowerGLCompatibility::getUniformLocation(llvm::GlobalVariable *var) { // @param [in] mds : The metadata constant of InOut Global variable to be decode. // @param [in] index : The the index of the metadata in the embellish type. // @param [out] out : Use to output the element's metadatas of the InOut Global variable. 
-void LowerGLCompatibility::decodeInOutMetaRecursivelyByIndex(llvm::Type *valueTy, llvm::Constant *mds, +void LowerGlCompatibility::decodeInOutMetaRecursivelyByIndex(llvm::Type *valueTy, llvm::Constant *mds, ArrayRef index, llvm::SmallVector &out) { auto currentType = valueTy; @@ -204,7 +204,7 @@ void LowerGLCompatibility::decodeInOutMetaRecursivelyByIndex(llvm::Type *valueTy // @param [in] valueTy : The metadata's embellish type. // @param [in] mds : The metadata constant of InOut Global variable to be decode. // @param [out] out : Use to output the element's metadatas of the InOut Global variable. -void LowerGLCompatibility::decodeInOutMetaRecursively(llvm::Type *valueTy, llvm::Constant *mds, +void LowerGlCompatibility::decodeInOutMetaRecursively(llvm::Type *valueTy, llvm::Constant *mds, llvm::SmallVector &out) { ShaderInOutMetadata md = {}; if (valueTy->isSingleValueType()) { @@ -236,7 +236,7 @@ void LowerGLCompatibility::decodeInOutMetaRecursively(llvm::Type *valueTy, llvm: // Collect "Return" instructions and replace those instructions with a branch instruction point to "ReturnBlock". // // @param [in] func : The entry function of the shader module. -void LowerGLCompatibility::unifyFunctionReturn(Function *func) { +void LowerGlCompatibility::unifyFunctionReturn(Function *func) { SmallVector retInsts; for (BasicBlock &block : *func) { Instruction *terminator = block.getTerminator(); @@ -263,7 +263,7 @@ void LowerGLCompatibility::unifyFunctionReturn(Function *func) { // ===================================================================================================================== // Collect "EmitCall" instructions in the shader module. 
-void LowerGLCompatibility::collectEmitInst() { +void LowerGlCompatibility::collectEmitInst() { for (Function &function : m_module->functions()) { auto mangledName = function.getName(); // We get all users before iterating because the iterator can be invalidated @@ -281,7 +281,7 @@ void LowerGLCompatibility::collectEmitInst() { // ===================================================================================================================== // Build resource may used in compatibility emulation. -void LowerGLCompatibility::collectEmulationResource() { +void LowerGlCompatibility::collectEmulationResource() { // Collect emulation information. for (auto &global : m_module->globals()) { if (global.getType()->getAddressSpace() == SPIRAS_Uniform && global.hasMetadata(gSPIRVMD::UniformConstant)) { @@ -455,7 +455,7 @@ void LowerGLCompatibility::collectEmulationResource() { // ===================================================================================================================== // Acquire the patch pointer for do lower, function unifyFunctionReturn may cause IR change. -void LowerGLCompatibility::buildPatchPositionInfo() { +void LowerGlCompatibility::buildPatchPositionInfo() { if (m_shaderStage == ShaderStageGeometry) collectEmitInst(); else @@ -476,44 +476,44 @@ void LowerGLCompatibility::buildPatchPositionInfo() { // ===================================================================================================================== // Check whether need do lower for ClipVertex. -bool LowerGLCompatibility::needLowerClipVertex() { +bool LowerGlCompatibility::needLowerClipVertex() { return (m_clipVertex != nullptr && !m_clipVertex->user_empty()); } // ===================================================================================================================== // Check whether need do lower for FrontColor. 
-bool LowerGLCompatibility::needLowerFrontColor() { +bool LowerGlCompatibility::needLowerFrontColor() { return (m_frontColor != nullptr && !m_frontColor->user_empty()); } // ===================================================================================================================== // Check whether need do lower for BackColor. -bool LowerGLCompatibility::needLowerBackColor() { +bool LowerGlCompatibility::needLowerBackColor() { return (m_backColor != nullptr && !m_backColor->user_empty()); } // ===================================================================================================================== // Check whether need do lower for FrontSecondaryColor. -bool LowerGLCompatibility::needLowerFrontSecondaryColor() { +bool LowerGlCompatibility::needLowerFrontSecondaryColor() { return (m_frontSecondaryColor != nullptr && !m_frontSecondaryColor->user_empty()); } // ===================================================================================================================== // Check whether need do lower for BackSecondaryColor. -bool LowerGLCompatibility::needLowerBackSecondaryColor() { +bool LowerGlCompatibility::needLowerBackSecondaryColor() { return (m_backSecondaryColor != nullptr && !m_backSecondaryColor->user_empty()); } // ===================================================================================================================== // Check whether need do emulate for draw pixels. -bool LowerGLCompatibility::needEmulateDrawPixels() { +bool LowerGlCompatibility::needEmulateDrawPixels() { auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); return (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.drawPixelsType != Vkgc::DrawPixelsTypeNone); } // ===================================================================================================================== // Check whether need do emulate for two-side lighting. 
-bool LowerGLCompatibility::needEmulateTwoSideLighting() { +bool LowerGlCompatibility::needEmulateTwoSideLighting() { auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); return (m_shaderStage == ShaderStageFragment) && buildInfo->glState.enableTwoSideLighting && (m_color != nullptr || m_secondaryColor != nullptr); @@ -521,14 +521,14 @@ bool LowerGLCompatibility::needEmulateTwoSideLighting() { // ===================================================================================================================== // Check whether need do emulate for bitmap. -bool LowerGLCompatibility::needEmulateBitmap() { +bool LowerGlCompatibility::needEmulateBitmap() { auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); return (m_shaderStage == ShaderStageFragment) && buildInfo->glState.enableBitmap; } // ===================================================================================================================== // Check whether need do emulate point/line smooth and line/polygon stipple. 
-bool LowerGLCompatibility::needEmulateSmoothStipple() { +bool LowerGlCompatibility::needEmulateSmoothStipple() { auto options = m_context->getPipelineContext()->getPipelineOptions(); return (m_shaderStage == ShaderStageFragment) && (options->getGlState().enablePolygonStipple || options->getGlState().enableLineSmooth || @@ -537,14 +537,14 @@ bool LowerGLCompatibility::needEmulateSmoothStipple() { // ===================================================================================================================== // Check whether need do clamp fs -bool LowerGLCompatibility::needLowerFragColor() { +bool LowerGlCompatibility::needLowerFragColor() { auto buildInfo = static_cast(m_context->getPipelineBuildInfo()); return m_fragColor && (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.enableColorClampFs); } // ===================================================================================================================== // Check whether need do alphaTest. -bool LowerGLCompatibility::needLowerAlphaTest() { +bool LowerGlCompatibility::needLowerAlphaTest() { auto buildInfo = static_cast(m_context->getPipelineBuildInfo()); return (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.alphaTestFunc != Vkgc::AlphaTestFunc::Always); } @@ -553,7 +553,7 @@ bool LowerGLCompatibility::needLowerAlphaTest() { // Create InOut global variable Metadata. // // @param [in] md : The base information of the in/out meta date. -MDTuple *LowerGLCompatibility::createInOutMd(const ShaderInOutMetadata &md) { +MDTuple *LowerGlCompatibility::createInOutMd(const ShaderInOutMetadata &md) { auto int64Type = m_builder->getInt64Ty(); // Built metadata for the array element std::vector mdValues; @@ -575,7 +575,7 @@ MDTuple *LowerGLCompatibility::createInOutMd(const ShaderInOutMetadata &md) { // Create builtin InOut global variable Metadata. // // @param [in] builtIn : The built-in kind of the in/out meta date. 
-MDTuple *LowerGLCompatibility::createBuiltInInOutMd(lgc::BuiltInKind builtIn) { +MDTuple *LowerGlCompatibility::createBuiltInInOutMd(lgc::BuiltInKind builtIn) { ShaderInOutMetadata inOutMd = {}; inOutMd.IsBuiltIn = true; inOutMd.Value = builtIn; @@ -584,7 +584,7 @@ MDTuple *LowerGLCompatibility::createBuiltInInOutMd(lgc::BuiltInKind builtIn) { // ===================================================================================================================== // Create the SPIR-V output builtin variable "gl_ClipDistance". -void LowerGLCompatibility::createClipDistance() { +void LowerGlCompatibility::createClipDistance() { assert(m_clipDistance == nullptr); auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); uint32_t indexOfLastClipPlane = 0; @@ -638,7 +638,7 @@ void LowerGLCompatibility::createClipDistance() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_ClipPlane". -void LowerGLCompatibility::createClipPlane() { +void LowerGlCompatibility::createClipPlane() { auto floatType = m_builder->getFloatTy(); auto vec4Type = FixedVectorType::get(floatType, 4); auto clipPlaneType = ArrayType::get(vec4Type, 8); @@ -667,7 +667,7 @@ void LowerGLCompatibility::createClipPlane() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_BackColor". 
-void LowerGLCompatibility::createBackColor() { +void LowerGlCompatibility::createBackColor() { auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); auto backColor = new GlobalVariable(*m_module, vec4Type, false, GlobalValue::ExternalLinkage, nullptr, "gl_BackColor", nullptr, GlobalVariable::GeneralDynamicTLSModel, SPIRV::SPIRAS_Input); @@ -682,7 +682,7 @@ void LowerGLCompatibility::createBackColor() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_BackSecondaryColor". -void LowerGLCompatibility::createBackSecondaryColor() { +void LowerGlCompatibility::createBackSecondaryColor() { auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); auto backSecondaryColor = new GlobalVariable(*m_module, vec4Type, false, GlobalValue::ExternalLinkage, nullptr, "gl_BackSecondaryColor", @@ -698,7 +698,7 @@ void LowerGLCompatibility::createBackSecondaryColor() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_FrontFacing". -void LowerGLCompatibility::createFrontFacing() { +void LowerGlCompatibility::createFrontFacing() { assert(m_frontFacing == nullptr); auto frontFacing = new GlobalVariable(*m_module, m_builder->getInt1Ty(), false, GlobalValue::ExternalLinkage, nullptr, @@ -709,7 +709,7 @@ void LowerGLCompatibility::createFrontFacing() { // ===================================================================================================================== // Create the ARB builtin variable "patchTexCoord". 
-void LowerGLCompatibility::createPatchTexCoord() { +void LowerGlCompatibility::createPatchTexCoord() { auto vec2Type = FixedVectorType::get(m_builder->getFloatTy(), 2); auto patchTexCoord = new GlobalVariable(*m_module, vec2Type, false, GlobalValue::ExternalLinkage, nullptr, "patchTexCoord", nullptr, @@ -725,7 +725,7 @@ void LowerGLCompatibility::createPatchTexCoord() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_FragDepth". -void LowerGLCompatibility::createFragDepth() { +void LowerGlCompatibility::createFragDepth() { assert(m_fragDepth == nullptr); auto fragDepth = new GlobalVariable(*m_module, m_builder->getFloatTy(), false, GlobalValue::ExternalLinkage, nullptr, @@ -736,7 +736,7 @@ void LowerGLCompatibility::createFragDepth() { // ===================================================================================================================== // Create the GLSL builtin variable "gl_fragStencilRef". -void LowerGLCompatibility::createFragStencilRef() { +void LowerGlCompatibility::createFragStencilRef() { assert(m_fragStencilRef == nullptr); auto fragStencilRef = new GlobalVariable(*m_module, m_builder->getInt32Ty(), false, GlobalValue::ExternalLinkage, nullptr, @@ -747,7 +747,7 @@ void LowerGLCompatibility::createFragStencilRef() { // ===================================================================================================================== // Inline the emulation instruction of clip vertex. -void LowerGLCompatibility::emulateStoreClipVertex() { +void LowerGlCompatibility::emulateStoreClipVertex() { auto floatType = m_builder->getFloatTy(); Type *vec4Type = VectorType::get(floatType, 4, false); // Load clipVertex @@ -776,7 +776,7 @@ void LowerGLCompatibility::emulateStoreClipVertex() { // Inline the emulation instruction of front/back/front secondary/back secondary color. 
// // @param [in] color : One of front/back/front secondary/back secondary color. -void LowerGLCompatibility::emulationOutputColor(llvm::User *color) { +void LowerGlCompatibility::emulationOutputColor(llvm::User *color) { auto floatType = m_builder->getFloatTy(); Type *vec4Type = VectorType::get(floatType, 4, false); // Load frontColor @@ -793,7 +793,7 @@ void LowerGLCompatibility::emulationOutputColor(llvm::User *color) { // ===================================================================================================================== // Emulate for draw pixels emulation. -void LowerGLCompatibility::emulateDrawPixels() { +void LowerGlCompatibility::emulateDrawPixels() { m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); auto floatType = m_builder->getFloatTy(); @@ -854,7 +854,7 @@ void LowerGLCompatibility::emulateDrawPixels() { // ===================================================================================================================== // Emulate for two-side lighting. -void LowerGLCompatibility::emulateTwoSideLighting() { +void LowerGlCompatibility::emulateTwoSideLighting() { auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); if (m_shaderStage == ShaderStageFragment) { m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); @@ -886,7 +886,7 @@ void LowerGLCompatibility::emulateTwoSideLighting() { // ===================================================================================================================== // Emulate for bitmap emulation. 
-void LowerGLCompatibility::emulateBitmap() { +void LowerGlCompatibility::emulateBitmap() { auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); auto floatType = m_builder->getFloatTy(); @@ -924,7 +924,7 @@ void LowerGLCompatibility::emulateBitmap() { // @param [in] valTy : current input value's type, should be global's valueType in top-level. // @param [in] metaVal : metadata value of current output variable. // @param [in] alphaScaleVal : calculated alpha scaling results, default value is one. -void LowerGLCompatibility::patchAlphaScaling(Value *val, Type *valTy, Constant *metaVal, Value *alphaScaleVal) { +void LowerGlCompatibility::patchAlphaScaling(Value *val, Type *valTy, Constant *metaVal, Value *alphaScaleVal) { ShaderInOutMetadata outputMeta = {}; if (valTy->isArrayTy()) { @@ -965,7 +965,7 @@ void LowerGLCompatibility::patchAlphaScaling(Value *val, Type *valTy, Constant * // ===================================================================================================================== // Emulate for point/line smooth and line/polygon stipple. -void LowerGLCompatibility::emulateSmoothStipple() { +void LowerGlCompatibility::emulateSmoothStipple() { auto options = m_context->getPipelineContext()->getPipelineOptions(); auto pipelineBuildInfo = static_cast(m_context->getPipelineBuildInfo()); bool needYInvert = pipelineBuildInfo->getGlState().originUpperLeft; @@ -1134,7 +1134,7 @@ void LowerGLCompatibility::emulateSmoothStipple() { // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_ClipVertex". -void LowerGLCompatibility::lowerClipVertex() { +void LowerGlCompatibility::lowerClipVertex() { if (m_clipPlane == nullptr) createClipPlane(); if (m_clipDistance == nullptr) @@ -1158,7 +1158,7 @@ void LowerGLCompatibility::lowerClipVertex() { // "gl_BackSecondaryColor". 
// // @param [in] color : One of gl_FrontColor/gl_BackColor/gl_FrontSecondaryColor/gl_BackSecondaryColor. -void LowerGLCompatibility::lowerColor(llvm::User *color) { +void LowerGlCompatibility::lowerColor(llvm::User *color) { if (m_shaderStage == ShaderStageVertex || m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageTessEval || m_shaderStage == ShaderStageFragment) { assert(m_retInst != nullptr); @@ -1174,37 +1174,37 @@ void LowerGLCompatibility::lowerColor(llvm::User *color) { // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_FrontColor". -void LowerGLCompatibility::lowerFrontColor() { +void LowerGlCompatibility::lowerFrontColor() { lowerColor(m_frontColor); } // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_BackColor". -void LowerGLCompatibility::lowerBackColor() { +void LowerGlCompatibility::lowerBackColor() { lowerColor(m_backColor); } // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_FrontSecondaryColor". -void LowerGLCompatibility::lowerFrontSecondaryColor() { +void LowerGlCompatibility::lowerFrontSecondaryColor() { lowerColor(m_frontSecondaryColor); } // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_BackSecondaryColor". 
-void LowerGLCompatibility::lowerBackSecondaryColor() { +void LowerGlCompatibility::lowerBackSecondaryColor() { lowerColor(m_backSecondaryColor); } // ===================================================================================================================== // Does clamp fragment color -void LowerGLCompatibility::lowerFragColor() { +void LowerGlCompatibility::lowerFragColor() { lowerColor(m_fragColor); } // ===================================================================================================================== // Does lowering operations for alpha test. -void LowerGLCompatibility::lowerAlphaTest() { +void LowerGlCompatibility::lowerAlphaTest() { GlobalVariable *outputLocationZero = nullptr; auto floatTy = m_builder->getFloatTy(); Type *vec4Type = VectorType::get(floatTy, 4, false); @@ -1227,7 +1227,7 @@ void LowerGLCompatibility::lowerAlphaTest() { if (outputLocationZero != nullptr && outputLocationZero->getValueType()->isVectorTy()) { auto type = cast(outputLocationZero->getValueType()); uint32_t vectorNum = type->getNumElements(); - if (vectorNum != 4) + if (vectorNum != 4 || !type->getElementType()->isFloatTy()) return; } else return; diff --git a/llpc/lower/LowerGLCompatibility.h b/llpc/lowering/LowerGlCompatibility.h similarity index 95% rename from llpc/lower/LowerGLCompatibility.h rename to llpc/lowering/LowerGlCompatibility.h index a23c612c8e..6bfd8e2513 100644 --- a/llpc/lower/LowerGLCompatibility.h +++ b/llpc/lowering/LowerGlCompatibility.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,14 +24,14 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file LowerGLCompatibility.h - * @brief LLPC header file: contains declaration of Llpc::LowerGLCompatibility + * @file LowerGlCompatibility.h + * @brief LLPC header file: contains declaration of Llpc::LowerGlCompatibility *********************************************************************************************************************** */ #pragma once +#include "Lowering.h" #include "SPIRVInternal.h" -#include "llpcSpirvLower.h" #include "lgc/Builder.h" #include "lgc/LgcDialect.h" #include "llvm/IR/PassManager.h" @@ -41,9 +41,9 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering ray query post inline. 
-class LowerGLCompatibility : public SpirvLower, public llvm::PassInfoMixin { +class LowerGlCompatibility : public SpirvLower, public llvm::PassInfoMixin { public: - LowerGLCompatibility(); + LowerGlCompatibility(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower GLSL compatibility variables and operations"; } diff --git a/llpc/lower/LowerGlobals.cpp b/llpc/lowering/LowerGlobals.cpp similarity index 96% rename from llpc/lower/LowerGlobals.cpp rename to llpc/lowering/LowerGlobals.cpp index 22cc6c806a..475fae5df8 100644 --- a/llpc/lower/LowerGlobals.cpp +++ b/llpc/lowering/LowerGlobals.cpp @@ -29,12 +29,12 @@ *********************************************************************************************************************** */ #include "LowerGlobals.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcDebug.h" #include "llpcGraphicsContext.h" #include "llpcRayTracingContext.h" -#include "llpcSpirvLowerUtil.h" #include "compilerutils/CompilerUtils.h" #include "compilerutils/TypesMetadata.h" #include "lgc/LgcDialect.h" @@ -1963,6 +1963,79 @@ void LowerGlobals::lowerUniformConstants() { globalUsers[inst->getFunction()].push_back(inst); } + // Replace uniform constant variable with a compile time constants if it is set from driver side. 
+ Vkgc::CompileConstInfo *compileTimeConstsInfo = + m_context->getPipelineContext()->getPipelineOptions()->compileConstInfo; + + if (compileTimeConstsInfo && compileTimeConstsInfo->numCompileTimeConstants > 0) { + GlobalVariable *compileTimeConstVal = nullptr; + bool foundGlobal = false; + MDNode *metaNode = global.getMetadata(gSPIRVMD::UniformConstant); + auto uniformConstantsSet = mdconst::extract(metaNode->getOperand(0))->getZExtValue(); + auto uniformConstantsBinding = mdconst::extract(metaNode->getOperand(1))->getZExtValue(); + auto uniformConstantsOffset = mdconst::dyn_extract(metaNode->getOperand(2))->getZExtValue(); + for (uint32_t i = 0; i < compileTimeConstsInfo->numCompileTimeConstants; i++) { + auto specializeUniformInfo = compileTimeConstsInfo->pCompileTimeConstants[i]; + if (specializeUniformInfo.offset == uniformConstantsOffset && + specializeUniformInfo.set == uniformConstantsSet && + specializeUniformInfo.binding == uniformConstantsBinding) { + // determine result constant type. + foundGlobal = true; + uint32_t uniformChannelCount = 1; + uint32_t uniformChannelBytesCount = 1; + Type *uniformTy = global.getValueType(); + assert(!uniformTy->isStructTy()); + Type *constTy = uniformTy; + + if (auto *vectorUniformTy = dyn_cast(uniformTy)) { + constTy = vectorUniformTy->getElementType(); + uniformChannelCount = vectorUniformTy->getElementCount().getFixedValue(); + } + uniformChannelBytesCount = constTy->getScalarSizeInBits() / (sizeof(uint8_t) * 8); + + if (uniformChannelBytesCount * uniformChannelCount != specializeUniformInfo.validBytes) { + // Don't support partial replacement now (like vector component partial replacement). 
+ continue; + } + + // Construct constants + Constant *constData[16] = {}; + compileTimeConstVal = new GlobalVariable(uniformTy, true, global.getLinkage(), nullptr, "", + GlobalValue::NotThreadLocal, SPIRAS_Private); + for (uint32_t i = 0; i < uniformChannelCount; i++) { + if (uniformTy->isFloatingPointTy()) { + double data = 0.0; + memcpy(&data, specializeUniformInfo.values.u8 + i * uniformChannelBytesCount, uniformChannelBytesCount); + constData[i] = ConstantFP::get(constTy, data); + } else { + uint64_t data = 0; + memcpy(&data, specializeUniformInfo.values.u8 + i * uniformChannelBytesCount, uniformChannelBytesCount); + constData[i] = ConstantInt::get(constTy, data); + } + } + + // Replace current uniform with known compile time constants. + Constant *initializer = uniformChannelCount > 1 ? ConstantVector::get(constData) : constData[0]; + compileTimeConstVal->setInitializer(initializer); + for (auto &eachFunc : globalUsers) { + for (auto *inst : eachFunc.second) { + inst->replaceUsesOfWith(&global, compileTimeConstVal); + } + } + + // Insert new global to list and remove replaced global variable. + global.getParent()->insertGlobalVariable(compileTimeConstVal); + mapGlobalVariableToProxy(compileTimeConstVal); + globalsToRemove.push_back(&global); + break; + } + } + // If replacement happens, skip following buffer load convert. 
+ if (foundGlobal) { + continue; + } + } + for (auto &eachFunc : globalUsers) { MDNode *metaNode = global.getMetadata(gSPIRVMD::UniformConstant); auto uniformConstantsSet = mdconst::extract(metaNode->getOperand(0))->getZExtValue(); diff --git a/llpc/lower/LowerGlobals.h b/llpc/lowering/LowerGlobals.h similarity index 99% rename from llpc/lower/LowerGlobals.h rename to llpc/lowering/LowerGlobals.h index 86ef63bc06..f4fc2c5b3d 100644 --- a/llpc/lower/LowerGlobals.h +++ b/llpc/lowering/LowerGlobals.h @@ -30,8 +30,8 @@ */ #pragma once +#include "Lowering.h" #include "SPIRVInternal.h" -#include "llpcSpirvLower.h" #include "vkgcDefs.h" #include "lgc/Builder.h" #include "llvm/IR/IRBuilder.h" diff --git a/llpc/lowering/LowerGraphLibrary.cpp b/llpc/lowering/LowerGraphLibrary.cpp new file mode 100644 index 0000000000..aa0672051c --- /dev/null +++ b/llpc/lowering/LowerGraphLibrary.cpp @@ -0,0 +1,268 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerGraphLibrary.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerGraphLibrary. + *********************************************************************************************************************** + */ + +#include "LowerGraphLibrary.h" +#include "LowerInternalLibraryIntrinsic.h" +#include "SPIRVInternal.h" +#include "lgc/Builder.h" +#include "lgc/BuiltIns.h" +#include "lgc/LgcWgDialect.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +#define DEBUG_TYPE "lower-graph-library" + +using namespace llvm; +using namespace Llpc; +using namespace lgc; +extern const char *WorkGraphNames[]; +constexpr unsigned WorkGraphFuncCount = 16; + +namespace AmdExtFunc { +enum : unsigned { + BackingStore = 0, // Backing store + ShaderDirectory, // Shader Directory + NodeDispatchInfo1, // Node Dispatch Info1 + NodeDispatchInfo2, // Node Dispatch Info2 + TraceBuffer, // Trace Buffer + LdsLoadDword, // Lds load dword + LdsStoreDword, // Lds store dword + LdsAtomicAddDword, // Lds atomic add + OutputCount, // Lds output count + Count +}; +} + +static const char *AmdExtNames[] = { + "AmdWorkGraphsBackingStore", "AmdWorkGraphsShaderDirectory", "AmdWorkGraphsNodeDispatchInfo1", + "AmdWorkGraphsNodeDispatchInfo2", "AmdWorkGraphsTraceBuffer", "AmdWorkGraphsLdsLoadDword", + "AmdWorkGraphsLdsStoreDword", "AmdWorkGraphsLdsAtomicAddDword", "AmdWorkGraphsOutputCount"}; + +// 
===================================================================================================================== +LowerGraphLibrary::LowerGraphLibrary() { + for (unsigned i = 0; i < AmdExtFunc::Count; ++i) { + m_extFuncNames[AmdExtNames[i]] = i; + } + for (unsigned i = 0; i < WorkGraphFuncCount; ++i) + m_workgraphNames.insert(WorkGraphNames[i]); +} + +// ===================================================================================================================== +// Executes this LLVM patching pass on the specified LLVM module. +// +// @param [in/out] module : LLVM module to be run on +// @param [in/out] analysisManager : Analysis manager to use for this transformation +PreservedAnalyses LowerGraphLibrary::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-execution-graph\n"); + SpirvLower::init(&module); + for (auto funcIt = module.begin(), funcEnd = module.end(); funcIt != funcEnd;) { + Function *func = &*funcIt++; + processLibraryFunction(func); + } + return PreservedAnalyses::none(); +} + +// ===================================================================================================================== +// Clear the block before patching the function +// +// @param func : The function to clear +BasicBlock *LowerGraphLibrary::clearBlock(Function *func) { + assert(func->size() == 1); + BasicBlock &entryBlock = func->getEntryBlock(); + for (auto instIt = entryBlock.begin(); instIt != entryBlock.end();) { + auto &inst = *instIt++; + inst.eraseFromParent(); + } + return &entryBlock; +} + +// ===================================================================================================================== +// Clear the block before patching the function +// +// @param func : The function to process +void LowerGraphLibrary::processLibraryFunction(Function *&func) { + LibraryFuncPtr amdLibraryFuncs[] = { + &LowerGraphLibrary::createBackingStore, &LowerGraphLibrary::createShaderDirectory, 
+ &LowerGraphLibrary::createNodeDispatchInfo1, &LowerGraphLibrary::createNodeDispatchInfo2, + &LowerGraphLibrary::createTraceBuffer, &LowerGraphLibrary::createLdsLoadDword, + &LowerGraphLibrary::createLdsStoreDword, &LowerGraphLibrary::createLdsAtomicAddDword, + &LowerGraphLibrary::createOutputCount}; + + if (m_workgraphNames.find(func->getName()) != m_workgraphNames.end()) { + func->setLinkage(GlobalValue::WeakAnyLinkage); + return; + } + auto funcIt = m_extFuncNames.find(func->getName()); + + if (funcIt != m_extFuncNames.end()) { + auto funcIdx = funcIt->second; + (this->*amdLibraryFuncs[funcIdx])(func, funcIdx); + return; + } + + auto &commonFuncTable = InternalLibraryIntrinsicUtil::LibraryFunctionTable::get().m_libFuncPtrs; + auto commonFuncIt = commonFuncTable.find(func->getName()); + if (commonFuncIt != commonFuncTable.end()) { + auto funcPtr = commonFuncIt->second; + m_builder->SetInsertPoint(clearBlock(func)); + (*funcPtr)(func, m_builder); + } +} + +// ===================================================================================================================== +// Create Backing store +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createBackingStore(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::BackingStore); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInGraphControlStruct)); +} + +// ===================================================================================================================== +// Create Shader Directory +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createShaderDirectory(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::ShaderDirectory); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInShaderDirectory)); +} + +// 
===================================================================================================================== +// Create Node Dispatch Info1 +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createNodeDispatchInfo1(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::NodeDispatchInfo1); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInNodeDispatchInfo1)); +} + +// ===================================================================================================================== +// Create Node Dispatch Info2 +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createNodeDispatchInfo2(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::NodeDispatchInfo2); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInNodeDispatchInfo2)); +} + +// ===================================================================================================================== +// Create Trace Buffer +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createTraceBuffer(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::TraceBuffer); + m_builder->SetInsertPoint(clearBlock(func)); + m_builder->CreateRet(m_builder->CreateReadBuiltInInput(lgc::BuiltInWorkGraphTraceBuf)); +} + +// ===================================================================================================================== +// Create Load DWORD from lds +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createLdsLoadDword(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::LdsLoadDword); + // AmdWorkGraphsLdsLoadDword(uint offset) in byte + m_builder->SetInsertPoint(clearBlock(func)); + Value *offset = 
func->getArg(0); + offset = m_builder->CreateLoad(m_builder->getInt32Ty(), offset); + // convert offset from BYTE to DWORD + offset = m_builder->CreateLShr(offset, 2); + auto graphLds = m_builder->create(); + auto ldsPtr = m_builder->CreateGEP(m_builder->getInt32Ty(), graphLds, {offset}); + // Load value from lds position + Value *ldsValue = m_builder->CreateLoad(m_builder->getInt32Ty(), ldsPtr); + m_builder->CreateRet(ldsValue); +} + +// ===================================================================================================================== +// Create store DWORD to lds +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createLdsStoreDword(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::LdsStoreDword); + // void AmdWorkGraphsLdsStoreDword(uint offset, uint value) + m_builder->SetInsertPoint(clearBlock(func)); + Value *offset = func->getArg(0); + offset = m_builder->CreateLoad(m_builder->getInt32Ty(), offset); + // convert offset from BYTE to DWORD + offset = m_builder->CreateLShr(offset, 2); + Value *value = func->getArg(1); + value = m_builder->CreateLoad(m_builder->getInt32Ty(), value); + auto graphLds = m_builder->create(); + auto ldsPtr = m_builder->CreateGEP(m_builder->getInt32Ty(), graphLds, {offset}); + m_builder->CreateStore(value, ldsPtr); + m_builder->CreateRetVoid(); +} + +// ===================================================================================================================== +// Create atomic add DWORD to lds +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createLdsAtomicAddDword(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::LdsAtomicAddDword); + // AmdWorkGraphsLdsAtomicAddDword(uint offset, uint value) + m_builder->SetInsertPoint(clearBlock(func)); + Value *offset = func->getArg(0); + offset = m_builder->CreateLoad(m_builder->getInt32Ty(), offset); + // convert offset 
from BYTE to DWORD + offset = m_builder->CreateLShr(offset, 2); + Value *value = func->getArg(1); + value = m_builder->CreateLoad(m_builder->getInt32Ty(), value); + auto graphLds = m_builder->create(); + auto ldsPtr = m_builder->CreateGEP(m_builder->getInt32Ty(), graphLds, {offset}); + m_builder->CreateAtomicRMW(AtomicRMWInst::Add, ldsPtr, value, MaybeAlign(), AtomicOrdering::Monotonic, + SyncScope::System); + m_builder->CreateRetVoid(); +} + +// ===================================================================================================================== +// Create output count +// +// @param func : The function to process +// @param funcId : The function ID +void LowerGraphLibrary::createOutputCount(Function *func, unsigned funcId) { + assert(funcId == AmdExtFunc::OutputCount); + // uint AmdWorkgraphsOutputCount() + m_builder->SetInsertPoint(clearBlock(func)); + auto outputCount = m_builder->create(); + m_builder->CreateRet(outputCount); +} diff --git a/llpc/lowering/LowerGraphLibrary.h b/llpc/lowering/LowerGraphLibrary.h new file mode 100644 index 0000000000..73bcba1f5d --- /dev/null +++ b/llpc/lowering/LowerGraphLibrary.h @@ -0,0 +1,68 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerGraphLibrary.h + * @brief LLPC header file: contains declaration of Llpc::LowerGraphLibrary + *********************************************************************************************************************** + */ +#pragma once + +#include "Lowering.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class BasicBlock; +} // namespace llvm + +namespace Llpc { + +// ===================================================================================================================== +// Represents the pass of SPIR-V lowering graph library +class LowerGraphLibrary : public SpirvLower, public llvm::PassInfoMixin { +public: + LowerGraphLibrary(); + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + static llvm::StringRef name() { return "Lower SPIR-V library shader"; } + +private: + typedef void (LowerGraphLibrary::*LibraryFuncPtr)(llvm::Function *, unsigned); + void processLibraryFunction(llvm::Function *&func); + llvm::BasicBlock *clearBlock(llvm::Function *func); + void createBackingStore(llvm::Function *func, unsigned); + void createShaderDirectory(llvm::Function *func, unsigned); + void 
createNodeDispatchInfo1(llvm::Function *func, unsigned); + void createNodeDispatchInfo2(llvm::Function *func, unsigned); + void createTraceBuffer(llvm::Function *func, unsigned); + void createLdsLoadDword(llvm::Function *func, unsigned); + void createLdsStoreDword(llvm::Function *func, unsigned); + void createLdsAtomicAddDword(llvm::Function *func, unsigned); + void createOutputCount(llvm::Function *func, unsigned); + llvm::DenseSet m_workgraphNames; // External linked workgraph functions + llvm::DenseMap m_extFuncNames; // Library functions to patch +}; +}; // namespace Llpc diff --git a/llpc/lower/LowerInstMetaRemove.cpp b/llpc/lowering/LowerInstMetaRemove.cpp similarity index 87% rename from llpc/lower/LowerInstMetaRemove.cpp rename to llpc/lowering/LowerInstMetaRemove.cpp index 2a826d8cad..5967d25f3f 100644 --- a/llpc/lower/LowerInstMetaRemove.cpp +++ b/llpc/lowering/LowerInstMetaRemove.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file LowerInstMetaRemove.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerInstMetaRemove. + * @brief LLPC source file: contains implementation of class Llpc::LowerInstMetaRemove. 
*********************************************************************************************************************** */ #include "LowerInstMetaRemove.h" @@ -34,7 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#define DEBUG_TYPE "llpc-spirv-lower-inst-meta-remove" +#define DEBUG_TYPE "lower-inst-meta-remove" using namespace llvm; using namespace SPIRV; @@ -43,7 +43,7 @@ using namespace Llpc; namespace Llpc { // ===================================================================================================================== -SpirvLowerInstMetaRemove::SpirvLowerInstMetaRemove() : m_changed(false) { +LowerInstMetaRemove::LowerInstMetaRemove() { } // ===================================================================================================================== @@ -51,11 +51,11 @@ SpirvLowerInstMetaRemove::SpirvLowerInstMetaRemove() : m_changed(false) { // // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerInstMetaRemove::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Inst-Meta-Remove\n"); +PreservedAnalyses LowerInstMetaRemove::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Inst-Meta-Remove\n"); SpirvLower::init(&module); - m_changed = false; + bool changed = false; // Remove calls to functions whose names start with "spirv.NonUniform". SmallVector callsToRemove; @@ -72,7 +72,7 @@ PreservedAnalyses SpirvLowerInstMetaRemove::run(Module &module, ModuleAnalysisMa for (auto *callInst : callsToRemove) { callInst->dropAllReferences(); callInst->eraseFromParent(); - m_changed = true; + changed = true; } // Remove any named metadata in the module that starts "spirv.". 
@@ -83,10 +83,10 @@ PreservedAnalyses SpirvLowerInstMetaRemove::run(Module &module, ModuleAnalysisMa } for (NamedMDNode *namedMdNode : nodesToRemove) { namedMdNode->eraseFromParent(); - m_changed = true; + changed = true; } - return m_changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + return changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } } // namespace Llpc diff --git a/llpc/lower/LowerInstMetaRemove.h b/llpc/lowering/LowerInstMetaRemove.h similarity index 90% rename from llpc/lower/LowerInstMetaRemove.h rename to llpc/lowering/LowerInstMetaRemove.h index 7a5343abe9..f8f5a85b88 100644 --- a/llpc/lower/LowerInstMetaRemove.h +++ b/llpc/lowering/LowerInstMetaRemove.h @@ -25,28 +25,25 @@ /** *********************************************************************************************************************** * @file LowerInstMetaRemove.h - * @brief LLPC header file: contains declaration of class Llpc::SpirvLowerInstMetaRemove. + * @brief LLPC header file: contains declaration of class Llpc::LowerInstMetaRemove. *********************************************************************************************************************** */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/PassManager.h" namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering operations for removing the instruction metadata. 
-class SpirvLowerInstMetaRemove : public SpirvLower, public llvm::PassInfoMixin { +class LowerInstMetaRemove : public SpirvLower, public llvm::PassInfoMixin { public: - SpirvLowerInstMetaRemove(); + LowerInstMetaRemove(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower SPIR-V instruction metadata by removing those targeted"; } - -private: - bool m_changed; // Whether the module is changed }; } // namespace Llpc diff --git a/llpc/lower/LowerInternalLibraryIntrinsic.cpp b/llpc/lowering/LowerInternalLibraryIntrinsic.cpp similarity index 94% rename from llpc/lower/LowerInternalLibraryIntrinsic.cpp rename to llpc/lowering/LowerInternalLibraryIntrinsic.cpp index 2b184a0006..8090df71d6 100644 --- a/llpc/lower/LowerInternalLibraryIntrinsic.cpp +++ b/llpc/lowering/LowerInternalLibraryIntrinsic.cpp @@ -68,6 +68,26 @@ static void createHalt(Function *func, Builder *builder) { builder->CreateRetVoid(); } +// ===================================================================================================================== +// Create device scope memory_order_acquire +// +// @param func : The function to process +// @param builder : The IR builder +static void createDeviceMemoryAcquire(Function *func, Builder *builder) { + builder->CreateFence(AtomicOrdering::Acquire, builder->getContext().getOrInsertSyncScopeID("agent")); + builder->CreateRetVoid(); +} + +// ===================================================================================================================== +// Create device scope memory_order_release +// +// @param func : The function to process +// @param builder : The IR builder +static void createDeviceMemoryRelease(Function *func, Builder *builder) { + builder->CreateFence(AtomicOrdering::Release, builder->getContext().getOrInsertSyncScopeID("agent")); + builder->CreateRetVoid(); +} + // 
===================================================================================================================== // Create function to compute the number of waves in the workgroup // @@ -372,6 +392,8 @@ InternalLibraryIntrinsicUtil::LibraryFunctionTable::LibraryFunctionTable() { m_libFuncPtrs["AmdExtLaneIndex"] = &createLaneIndex; m_libFuncPtrs["AmdExtLaneCount"] = &createLaneCount; m_libFuncPtrs["AmdExtHalt"] = &createHalt; + m_libFuncPtrs["AmdExtDeviceMemoryAcquire"] = &createDeviceMemoryAcquire; + m_libFuncPtrs["AmdExtDeviceMemoryRelease"] = &createDeviceMemoryRelease; m_libFuncPtrs["AmdExtNumWavesCompute"] = &createNumWavesCompute; m_libFuncPtrs["AmdExtWaveIndexCompute"] = &createWaveIndexCompute; m_libFuncPtrs["AmdExtGroupIdCompute"] = &createGroupIdCompute; diff --git a/llpc/lower/LowerInternalLibraryIntrinsic.h b/llpc/lowering/LowerInternalLibraryIntrinsic.h similarity index 100% rename from llpc/lower/LowerInternalLibraryIntrinsic.h rename to llpc/lowering/LowerInternalLibraryIntrinsic.h diff --git a/llpc/lower/LowerMath.cpp b/llpc/lowering/LowerMath.cpp similarity index 89% rename from llpc/lower/LowerMath.cpp rename to llpc/lowering/LowerMath.cpp index f5280171a4..a097d0dbc9 100644 --- a/llpc/lower/LowerMath.cpp +++ b/llpc/lowering/LowerMath.cpp @@ -25,15 +25,15 @@ /** *********************************************************************************************************************** * @file LowerMath.cpp - * @brief LLPC source file: implementations of Llpc::SpirvLowerMathConstFolding and Llpc::SpirvLowerMathFloatOp. + * @brief LLPC source file: implementations of Llpc::LowerMathConstFolding and Llpc::LowerMathFloatOp. 
*********************************************************************************************************************** */ #include "LowerMath.h" +#include "Lowering.h" #include "SPIRVInternal.h" #include "hex_float.h" #include "llpcContext.h" #include "llpcGraphicsContext.h" -#include "llpcSpirvLower.h" #include "lgc/Builder.h" #include "lgc/Pipeline.h" #include "llvm/Analysis/ConstantFolding.h" @@ -46,9 +46,9 @@ #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" -#define DEBUG_TYPE_CONST_FOLDING "llpc-spirv-lower-math-const-folding" -#define DEBUG_TYPE_PRECISION "llpc-spirv-lower-math-precision" -#define DEBUG_TYPE_FLOAT_OP "llpc-spirv-lower-math-float-op" +#define DEBUG_TYPE_CONST_FOLDING "lower-math-const-folding" +#define DEBUG_TYPE_PRECISION "lower-math-precision" +#define DEBUG_TYPE_FLOAT_OP "lower-math-float-op" using namespace lgc; using namespace llvm; @@ -63,9 +63,12 @@ static cl::opt BackwardPropagateNoContract("backward-propagate-no-contract", cl::desc("Backward propagate NoContraction decorations to input operations"), cl::init(false)); +static cl::opt DisableGlPositionOpt("disable-gl-position-opt", + cl::desc("Disable all use of fast math flags on gl_Position"), + cl::init(false)); // ===================================================================================================================== -SpirvLowerMath::SpirvLowerMath() +LowerMath::LowerMath() : m_changed(false), m_fp16DenormFlush(false), m_fp32DenormFlush(false), m_fp64DenormFlush(false), m_fp16RoundToZero(false) { } @@ -88,7 +91,7 @@ static void setFpMathAttribute(Function &func, bool fp32, FpDenormMode denormMod // Initialise transform class. 
// // @param [in/out] module : LLVM module to be run on -void SpirvLowerMath::init(Module &module) { +void LowerMath::init(Module &module) { SpirvLower::init(&module); m_changed = false; @@ -115,7 +118,7 @@ void SpirvLowerMath::init(Module &module) { // Checks desired denormal flush behavior and inserts llvm.canonicalize. // // @param inst : Instruction to flush denormals if needed -void SpirvLowerMath::flushDenormIfNeeded(Instruction *inst) { +void LowerMath::flushDenormIfNeeded(Instruction *inst) { auto destTy = inst->getType(); if ((destTy->getScalarType()->isHalfTy() && m_fp16DenormFlush) || (destTy->getScalarType()->isFloatTy() && m_fp32DenormFlush) || @@ -149,7 +152,8 @@ static bool isNoContract(Value *value) { // Disable fast math for all values related with the specified value // // @param value : Value to disable fast math for -static void disableFastMath(Value *value) { +// @param clearAll : Whether to clear all flags, including nnan and nsz +static void disableFastMath(Value *value, bool clearAll) { std::set allValues; std::list workSet; if (isa(value)) { @@ -160,10 +164,15 @@ static void disableFastMath(Value *value) { auto it = workSet.begin(); while (!workSet.empty()) { if (isa(*it)) { - // Reset fast math flags to default + // Reset fast math flags to default, but maintain nsz and nnan as required. 
auto inst = cast(*it); - FastMathFlags fastMathFlags; - inst->copyFastMathFlags(fastMathFlags); + FastMathFlags newFmf; + if (!clearAll) { + FastMathFlags instFmf = inst->getFastMathFlags(); + newFmf.setNoSignedZeros(instFmf.noSignedZeros()); + newFmf.setNoNaNs(instFmf.noNaNs()); + } + inst->copyFastMathFlags(newFmf); } for (Value *operand : (*it)->operands()) { @@ -188,10 +197,10 @@ static void disableFastMath(Value *value) { // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerMathConstFolding::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Math-Const-Folding\n"); +PreservedAnalyses LowerMathConstFolding::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Math-Const-Folding\n"); - SpirvLowerMath::init(module); + LowerMath::init(module); if (m_shaderStage == ShaderStageInvalid) return PreservedAnalyses::all(); @@ -247,14 +256,14 @@ PreservedAnalyses SpirvLowerMathConstFolding::run(Module &module, ModuleAnalysis // ===================================================================================================================== // Return the module entry point function. -Function *SpirvLowerMathConstFolding::getEntryPoint() { +Function *LowerMathConstFolding::getEntryPoint() { return m_entryPoint; } #undef DEBUG_TYPE // DEBUG_TYPE_CONST_FOLDING #define DEBUG_TYPE DEBUG_TYPE_PRECISION -bool SpirvLowerMathPrecision::adjustExports(Module &module) { +bool LowerMathPrecision::adjustExports(Module &module, bool disablePositionOpt) { bool changed = false; for (auto &func : module.functions()) { // Disable fast math for gl_Position. 
@@ -282,7 +291,7 @@ bool SpirvLowerMathPrecision::adjustExports(Module &module) { } if (valueWritten && builtIn == lgc::BuiltInPosition) { - disableFastMath(valueWritten); + disableFastMath(valueWritten, disablePositionOpt); changed = true; } } @@ -301,7 +310,7 @@ static bool clearContractFlag(Instruction *inst) { return true; } -bool SpirvLowerMathPrecision::propagateNoContract(Module &module, bool forward, bool backward) { +bool LowerMathPrecision::propagateNoContract(Module &module, bool forward, bool backward) { bool changed = false; SmallVector roots; @@ -373,8 +382,8 @@ bool SpirvLowerMathPrecision::propagateNoContract(Module &module, bool forward, // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Math-Precision\n"); +PreservedAnalyses LowerMathPrecision::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Math-Precision\n"); SpirvLower::init(&module); if (m_shaderStage == ShaderStageInvalid) @@ -382,18 +391,21 @@ PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisMan bool forwardPropagate = false; bool backwardPropagate = false; + bool disableGlPositionOpt = false; auto pipelineContext = m_context->getPipelineContext(); switch (pipelineContext->getPipelineType()) { case PipelineType::Graphics: { auto shaderInfo = (static_cast(pipelineContext))->getPipelineShaderInfo(m_shaderStage); forwardPropagate = forwardPropagate || shaderInfo->options.forwardPropagateNoContract; backwardPropagate = backwardPropagate || shaderInfo->options.backwardPropagateNoContract; + disableGlPositionOpt = shaderInfo->options.disableGlPositionOpt; break; } case PipelineType::Compute: { auto shaderInfo = 
&(static_cast(pipelineContext->getPipelineBuildInfo()))->cs; forwardPropagate = forwardPropagate || shaderInfo->options.forwardPropagateNoContract; backwardPropagate = backwardPropagate || shaderInfo->options.backwardPropagateNoContract; + disableGlPositionOpt = shaderInfo->options.disableGlPositionOpt; break; } case PipelineType::RayTracing: { @@ -405,6 +417,7 @@ PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisMan continue; forwardPropagate = forwardPropagate || pipelineInfo->pShaders[i].options.forwardPropagateNoContract; backwardPropagate = backwardPropagate || pipelineInfo->pShaders[i].options.backwardPropagateNoContract; + disableGlPositionOpt = pipelineInfo->pShaders[i].options.disableGlPositionOpt; } break; } @@ -416,10 +429,12 @@ PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisMan forwardPropagate = ForwardPropagateNoContract; if (BackwardPropagateNoContract.getNumOccurrences()) backwardPropagate = BackwardPropagateNoContract; + if (DisableGlPositionOpt.getNumOccurrences()) + disableGlPositionOpt = DisableGlPositionOpt; bool adjustedExports = false; if (pipelineContext->getPipelineOptions()->enableImplicitInvariantExports) - adjustedExports = adjustExports(module); + adjustedExports = adjustExports(module, disableGlPositionOpt); bool propagatedNoContract = false; if (forwardPropagate || backwardPropagate) @@ -436,10 +451,10 @@ PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisMan // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerMathFloatOp::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Math-Float-Op\n"); +PreservedAnalyses LowerMathFloatOp::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Math-Float-Op\n"); - 
SpirvLowerMath::init(module); + LowerMath::init(module); visit(m_module); return m_changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); @@ -449,7 +464,7 @@ PreservedAnalyses SpirvLowerMathFloatOp::run(Module &module, ModuleAnalysisManag // Visits binary operator instruction. // // @param binaryOp : Binary operator instruction -void SpirvLowerMathFloatOp::visitBinaryOperator(BinaryOperator &binaryOp) { +void LowerMathFloatOp::visitBinaryOperator(BinaryOperator &binaryOp) { Instruction::BinaryOps opCode = binaryOp.getOpcode(); auto src1 = binaryOp.getOperand(0); @@ -536,7 +551,7 @@ void SpirvLowerMathFloatOp::visitBinaryOperator(BinaryOperator &binaryOp) { // Visits call instruction. // // @param callInst : Call instruction -void SpirvLowerMathFloatOp::visitCallInst(CallInst &callInst) { +void LowerMathFloatOp::visitCallInst(CallInst &callInst) { auto callee = callInst.getCalledFunction(); if (!callee) return; @@ -551,7 +566,7 @@ void SpirvLowerMathFloatOp::visitCallInst(CallInst &callInst) { // Visits fptrunc instruction. // // @param fptruncInst : Fptrunc instruction -void SpirvLowerMathFloatOp::visitFPTruncInst(FPTruncInst &fptruncInst) { +void LowerMathFloatOp::visitFPTruncInst(FPTruncInst &fptruncInst) { if (m_fp16RoundToZero) { auto src = fptruncInst.getOperand(0); auto srcTy = src->getType(); diff --git a/llpc/lower/LowerMath.h b/llpc/lowering/LowerMath.h similarity index 88% rename from llpc/lower/LowerMath.h rename to llpc/lowering/LowerMath.h index d4c30d8f4c..2c2e702f63 100644 --- a/llpc/lower/LowerMath.h +++ b/llpc/lowering/LowerMath.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/PassManager.h" @@ -39,9 +39,9 @@ namespace Llpc { // ===================================================================================================================== // SPIR-V lowering operations for math transformation. 
-class SpirvLowerMath : public SpirvLower { +class LowerMath : public SpirvLower { public: - SpirvLowerMath(); + LowerMath(); protected: void init(llvm::Module &module); @@ -56,7 +56,7 @@ class SpirvLowerMath : public SpirvLower { // ===================================================================================================================== // SPIR-V lowering operations for math constant folding. -class SpirvLowerMathConstFolding : public SpirvLowerMath, public llvm::PassInfoMixin { +class LowerMathConstFolding : public LowerMath, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); @@ -70,22 +70,22 @@ class SpirvLowerMathConstFolding : public SpirvLowerMath, public llvm::PassInfoM // ===================================================================================================================== // SPIR-V lowering operations to adjust fast math flags. -class SpirvLowerMathPrecision : public SpirvLower, public llvm::PassInfoMixin { +class LowerMathPrecision : public SpirvLower, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower SPIR-V for precision (fast math flags)"; } - bool adjustExports(llvm::Module &module); + bool adjustExports(llvm::Module &module, bool clearAll); bool propagateNoContract(llvm::Module &module, bool forward, bool backward); }; // ===================================================================================================================== // SPIR-V lowering operations for math floating point optimisation. 
-class SpirvLowerMathFloatOp : public SpirvLowerMath, - public llvm::PassInfoMixin, - public llvm::InstVisitor { +class LowerMathFloatOp : public LowerMath, + public llvm::PassInfoMixin, + public llvm::InstVisitor { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/llpc/lower/LowerMemoryOp.cpp b/llpc/lowering/LowerMemoryOp.cpp similarity index 99% rename from llpc/lower/LowerMemoryOp.cpp rename to llpc/lowering/LowerMemoryOp.cpp index 7b98ea7f02..2a4e3b2799 100644 --- a/llpc/lower/LowerMemoryOp.cpp +++ b/llpc/lowering/LowerMemoryOp.cpp @@ -208,6 +208,8 @@ bool LowerMemoryOp::needExpandDynamicIndex(GetElementPtrInst *getElemPtr, unsign // NOTE: Normal SPIR-V translation won't generate this, it may come from our internally inserted // instructions to do pointer increment. allowExpand = false; + } else if (indexedTy->isFloatTy()) { + allowExpand = false; } else { llvm_unreachable("Should never be called!"); allowExpand = false; diff --git a/llpc/lower/LowerMemoryOp.h b/llpc/lowering/LowerMemoryOp.h similarity index 99% rename from llpc/lower/LowerMemoryOp.h rename to llpc/lowering/LowerMemoryOp.h index 842f4c1d8b..004f6ebbd3 100644 --- a/llpc/lower/LowerMemoryOp.h +++ b/llpc/lowering/LowerMemoryOp.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/PassManager.h" #include diff --git a/llpc/lower/LowerPostInline.cpp b/llpc/lowering/LowerPostInline.cpp similarity index 99% rename from llpc/lower/LowerPostInline.cpp rename to llpc/lowering/LowerPostInline.cpp index 64400f8193..1646860944 100644 --- a/llpc/lower/LowerPostInline.cpp +++ b/llpc/lowering/LowerPostInline.cpp @@ -29,9 +29,9 @@ *********************************************************************************************************************** */ #include "LowerPostInline.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" 
-#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" #include "lgc/Pipeline.h" #include "llvm/IR/DerivedTypes.h" diff --git a/llpc/lower/LowerPostInline.h b/llpc/lowering/LowerPostInline.h similarity index 98% rename from llpc/lower/LowerPostInline.h rename to llpc/lowering/LowerPostInline.h index b3937f2017..1224cab3e6 100644 --- a/llpc/lower/LowerPostInline.h +++ b/llpc/lowering/LowerPostInline.h @@ -30,7 +30,7 @@ */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/PassManager.h" namespace Llpc { diff --git a/llpc/lower/LowerRayTracing.cpp b/llpc/lowering/LowerRayTracing.cpp similarity index 99% rename from llpc/lower/LowerRayTracing.cpp rename to llpc/lowering/LowerRayTracing.cpp index a9d0800e36..e4fc9a50b9 100644 --- a/llpc/lower/LowerRayTracing.cpp +++ b/llpc/lowering/LowerRayTracing.cpp @@ -30,11 +30,11 @@ */ #include "LowerRayTracing.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "gpurt-compiler.h" #include "llpcContext.h" #include "llpcRayTracingContext.h" -#include "llpcSpirvLowerUtil.h" #include "compilerutils/CompilerUtils.h" #include "llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/GpurtContext.h" diff --git a/llpc/lower/LowerRayTracing.h b/llpc/lowering/LowerRayTracing.h similarity index 99% rename from llpc/lower/LowerRayTracing.h rename to llpc/lowering/LowerRayTracing.h index 8eeee82d1b..5a93734527 100644 --- a/llpc/lower/LowerRayTracing.h +++ b/llpc/lowering/LowerRayTracing.h @@ -30,8 +30,8 @@ */ #pragma once +#include "Lowering.h" #include "SPIRVInternal.h" -#include "llpcSpirvLower.h" #include "compilerutils/CompilerUtils.h" #include "llvm/ADT/SmallSet.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lower/LowerTerminator.cpp b/llpc/lowering/LowerTerminator.cpp similarity index 93% rename from llpc/lower/LowerTerminator.cpp rename to llpc/lowering/LowerTerminator.cpp index 6544d37896..8e88d9c758 100644 --- a/llpc/lower/LowerTerminator.cpp +++ 
b/llpc/lowering/LowerTerminator.cpp @@ -25,24 +25,24 @@ /** *********************************************************************************************************************** * @file LowerTerminator.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerTerminator. + * @brief LLPC source file: contains implementation of class Llpc::LowerTerminator. * @details This pass removes trailing instructions after known terminators. * These dead instructions can occur when functions calling terminators, such as OpKill, are inlined. *********************************************************************************************************************** */ #include "LowerTerminator.h" +#include "Lowering.h" +#include "LoweringUtil.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcDebug.h" -#include "llpcSpirvLower.h" -#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#define DEBUG_TYPE "llpc-spirv-lower-terminator" +#define DEBUG_TYPE "lower-terminator" using namespace llvm; using namespace SPIRV; @@ -55,8 +55,8 @@ namespace Llpc { // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerTerminator::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Terminator\n"); +PreservedAnalyses LowerTerminator::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Terminator\n"); SpirvLower::init(&module); @@ -87,7 +87,7 @@ PreservedAnalyses SpirvLowerTerminator::run(Module &module, ModuleAnalysisManage // If found, mark dead instructions for removal and add a return immediately following the kill. 
// // @param callInst : "Call" instruction -void SpirvLowerTerminator::visitCallInst(CallInst &callInst) { +void LowerTerminator::visitCallInst(CallInst &callInst) { auto callee = callInst.getCalledFunction(); if (!callee) return; diff --git a/llpc/lower/LowerTerminator.h b/llpc/lowering/LowerTerminator.h similarity index 88% rename from llpc/lower/LowerTerminator.h rename to llpc/lowering/LowerTerminator.h index bb472d9768..58e548c94c 100644 --- a/llpc/lower/LowerTerminator.h +++ b/llpc/lowering/LowerTerminator.h @@ -25,12 +25,12 @@ /** *********************************************************************************************************************** * @file LowerTerminator.h - * @brief LLPC header file: contains declaration of Llpc::SpirvLowerTerminator + * @brief LLPC header file: contains declaration of Llpc::LowerTerminator *********************************************************************************************************************** */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/PassManager.h" @@ -38,9 +38,9 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering terminators. 
-class SpirvLowerTerminator : public SpirvLower, - public llvm::PassInfoMixin, - public llvm::InstVisitor { +class LowerTerminator : public SpirvLower, + public llvm::PassInfoMixin, + public llvm::InstVisitor { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/llpc/lower/LowerTranslator.cpp b/llpc/lowering/LowerTranslator.cpp similarity index 92% rename from llpc/lower/LowerTranslator.cpp rename to llpc/lowering/LowerTranslator.cpp index 0c86246823..657b630156 100644 --- a/llpc/lower/LowerTranslator.cpp +++ b/llpc/lowering/LowerTranslator.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file LowerTranslator.cpp - * @brief LLPC source file: contains implementation of Llpc::SpirvLowerTranslator + * @brief LLPC source file: contains implementation of Llpc::LowerTranslator *********************************************************************************************************************** */ #include "LowerTranslator.h" @@ -36,7 +36,7 @@ #include #include -#define DEBUG_TYPE "llpc-spirv-lower-translator" +#define DEBUG_TYPE "lower-translator" using namespace llvm; using namespace Llpc; @@ -46,8 +46,8 @@ using namespace Llpc; // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -llvm::PreservedAnalyses SpirvLowerTranslator::run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Translator\n"); +llvm::PreservedAnalyses LowerTranslator::run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Translator\n"); SpirvLower::init(&module); @@ -67,7 +67,7 @@ llvm::PreservedAnalyses SpirvLowerTranslator::run(llvm::Module &module, llvm::Mo // // @param shaderInfo : Specialization info 
// @param [in/out] module : Module to translate into, initially empty -void SpirvLowerTranslator::translateSpirvToLlvm(const PipelineShaderInfo *shaderInfo, Module *module) { +void LowerTranslator::translateSpirvToLlvm(const PipelineShaderInfo *shaderInfo, Module *module) { BinaryData optimizedSpirvBin = {}; const ShaderModuleData *moduleData = reinterpret_cast(shaderInfo->pModuleData); assert(moduleData->binType == BinaryType::Spirv); diff --git a/llpc/lower/LowerTranslator.h b/llpc/lowering/LowerTranslator.h similarity index 87% rename from llpc/lower/LowerTranslator.h rename to llpc/lowering/LowerTranslator.h index 612e150376..86416d21aa 100644 --- a/llpc/lower/LowerTranslator.h +++ b/llpc/lowering/LowerTranslator.h @@ -25,26 +25,26 @@ /** *********************************************************************************************************************** * @file LowerTranslator.h - * @brief LLPC header file: contains declaration of Llpc::SpirvLowerTranslator + * @brief LLPC header file: contains declaration of Llpc::LowerTranslator *********************************************************************************************************************** */ #pragma once -#include "llpcSpirvLower.h" +#include "Lowering.h" #include "llvm/IR/PassManager.h" namespace Llpc { // ===================================================================================================================== // Pass to translate the SPIR-V modules and generate an IR module for the whole pipeline -class SpirvLowerTranslator : public SpirvLower, public llvm::PassInfoMixin { +class LowerTranslator : public SpirvLower, public llvm::PassInfoMixin { public: - SpirvLowerTranslator() {} + LowerTranslator() {} // // @param stage : Shader stage // @param shaderInfo : Shader info for this shader - SpirvLowerTranslator(ShaderStage stage, const PipelineShaderInfo *shaderInfo, llvm::StringRef globalVarPrefix = {}) + LowerTranslator(ShaderStage stage, const PipelineShaderInfo *shaderInfo, 
llvm::StringRef globalVarPrefix = {}) : m_shaderInfo(shaderInfo), m_globalVarPrefix(globalVarPrefix) {} llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); @@ -57,7 +57,7 @@ class SpirvLowerTranslator : public SpirvLower, public llvm::PassInfoMixin> argPromotionsFuncs; - auto rtipVersion = m_context->getPipelineContext()->getRayTracingState()->rtIpVersion; - unsigned rtip = rtipVersion.major * 10 + rtipVersion.minor; SmallVector maybeRtFuncs; for (Function &func : module) { if (func.isDeclaration() || !func.hasName()) @@ -83,11 +81,11 @@ PreservedAnalyses ProcessGpuRtLibrary::run(Module &module, ModuleAnalysisManager StringRef funcName = func.getName(); SmallBitVector argPromotions(/*size=*/8); bool isRqFunc = false; - if (funcName.starts_with("TraceRayInline")) + if (funcName.starts_with("_RayQuery_TraceRayInline")) argPromotions.set(1, 8); - else if (funcName.starts_with("RayQueryProceed")) + else if (funcName.starts_with("_RayQuery_Proceed")) argPromotions.set(1, 3); - else if (funcName.starts_with("FetchTrianglePositionFromRayQuery")) + else if (funcName.starts_with("_RayQuery_FetchTrianglePosition")) argPromotions.set(1); else { StringRef rqFuncName = funcName; @@ -102,40 +100,6 @@ PreservedAnalyses ProcessGpuRtLibrary::run(Module &module, ModuleAnalysisManager maybeRtFuncs.push_back(&func); continue; } - - // This is a rayQuery function, and we have the args requiring promotion in the argPromotions bit vector. - // Parse off the RTIP suffix if any, e.g. "2_0", into a two-digit decimal number, e.g. 20. - // Ignore BVH8 funcs. 
- if (funcName.ends_with("BVH8")) - continue; - StringRef funcSuffix = funcName.take_back(3); - unsigned funcRtip = 0; - if (funcSuffix.size() == 3 && isdigit(funcSuffix[0]) && funcSuffix[1] == '_' && isdigit(funcSuffix[2])) { - funcRtip = (funcSuffix[0] - '0') * 10 + (funcSuffix[2] - '0'); - funcName = funcName.drop_back(funcSuffix.size()); - } - // If this function has an RTIP suffix but it is wrong, ignore it (leaving it as internal linkage so it gets - // removed later). - if (funcRtip != 0 && funcRtip != rtip) - continue; - - if (funcRtip != 0) { - // We have a function with the correct RTIP suffix. We want to rename it without the RTIP suffix. - // If there is another function of the same name without the RTIP suffix, take its name and make the - // other function internal so it gets removed later. (This works whether we saw that function first or - // this RTIP-suffixed one.) - if (Function *otherFunc = module.getFunction(funcName)) { - otherFunc->setLinkage(GlobalValue::InternalLinkage); - func.takeName(otherFunc); - } else { - // No other function. Set name the normal way. Note use of str() to copy the unsuffixed name out - // before setName() frees it. - func.setName(funcName.str()); - } - } - // Set external linkage on this function. - func.setLinkage(GlobalValue::WeakAnyLinkage); - if (argPromotions.any()) { // Add this function to the list that need arg promotion. // We don't do the arg promotion here as it invalidates the module iterator. @@ -151,8 +115,7 @@ PreservedAnalyses ProcessGpuRtLibrary::run(Module &module, ModuleAnalysisManager Function *func = argPromotionsFunc.first; if (func->getLinkage() == GlobalValue::InternalLinkage) continue; - Function *promotedFunc = CompilerUtils::promotePointerArguments(func, argPromotionsFunc.second); - promotedFunc->setLinkage(GlobalValue::WeakAnyLinkage); + CompilerUtils::promotePointerArguments(func, argPromotionsFunc.second); } // Process ray-tracing (i.e. 
non-rayQuery) functions in a separate loop; processLibraryFunction() may do diff --git a/llpc/lower/ProcessGpuRtLibrary.h b/llpc/lowering/ProcessGpuRtLibrary.h similarity index 99% rename from llpc/lower/ProcessGpuRtLibrary.h rename to llpc/lowering/ProcessGpuRtLibrary.h index 30fe5a5ca5..6f4ceb1aa6 100644 --- a/llpc/lower/ProcessGpuRtLibrary.h +++ b/llpc/lowering/ProcessGpuRtLibrary.h @@ -29,8 +29,8 @@ *********************************************************************************************************************** */ #pragma once +#include "Lowering.h" #include "SPIRVInternal.h" -#include "llpcSpirvLower.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/IR/PassManager.h" diff --git a/llpc/lowering/ScalarReplacementOfBuiltins.cpp b/llpc/lowering/ScalarReplacementOfBuiltins.cpp new file mode 100644 index 0000000000..deb8b0ab0f --- /dev/null +++ b/llpc/lowering/ScalarReplacementOfBuiltins.cpp @@ -0,0 +1,442 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file ScalarReplacementOfBuiltins.cpp + * @brief LLPC source file: split and replace global variables that are structures containing built-in values + *********************************************************************************************************************** + */ +#include "ScalarReplacementOfBuiltins.h" +#include "SPIRVInternal.h" +#include "llpcContext.h" +#include "vkgcDefs.h" +#include "spirv/spirv.hpp" +#include "lgc/Builder.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include + +#define DEBUG_TYPE "scalar-replacement-of-builtins" + +using namespace llvm; +using namespace lgc; +using namespace Llpc; + +namespace Llpc { + +// ===================================================================================================================== +// Executes this SPIR-V lowering pass on the specified LLVM module. 
+// +// @param [in/out] module : LLVM module to be run on +// @param [in/out] analysisManager : Analysis manager to use for this transformation +PreservedAnalyses ScalarReplacementOfBuiltins::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG( + dbgs() << "Run the pass refactor and replace global variables that are structures containing built-in values\n"); + + bool changed = false; + SpirvLower::init(&module); + SmallVector originalGlobals(make_pointer_range(m_module->globals())); + for (auto &global : originalGlobals) { + if (!needsSplit(global)) + continue; + + // TODO: Handle the case where globalBuiltinVar is gl_in or gl_MeshVerticesEXT. + if (global->getValueType()->isStructTy()) { + splitBuiltinStructure(global); + changed = true; + } else if (global->getValueType()->isArrayTy()) { + splitBuiltinArray(global); + changed = true; + } + } + return changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +// ===================================================================================================================== +// Retrieves metadata for shader input/output elements based on their type. 
+// +// @param elementType : Type of the shader input/output element +// @param elementMetadata : Metadata values for initializing the metadata structure +ShaderInOutMetadata ScalarReplacementOfBuiltins::getShaderInOutMetadata(Type *elementType, Constant *elementMetadata) { + ShaderInOutMetadata inOutMeta = {}; + if (elementType->isArrayTy()) { + assert(elementMetadata->getNumOperands() == 4); + inOutMeta.U64All[0] = cast(elementMetadata->getOperand(2))->getZExtValue(); + inOutMeta.U64All[1] = cast(elementMetadata->getOperand(3))->getZExtValue(); + } else { + assert(elementMetadata->getNumOperands() == 2); + inOutMeta.U64All[0] = cast(elementMetadata->getOperand(0))->getZExtValue(); + inOutMeta.U64All[1] = cast(elementMetadata->getOperand(1))->getZExtValue(); + } + return inOutMeta; +} + +// ===================================================================================================================== +// Determine whether the structure needs to be split. +// +// @param globalBuiltinVar : Global variable containing built-in type +bool ScalarReplacementOfBuiltins::needsSplit(GlobalVariable *globalBuiltinVar) { + auto addressSpace = globalBuiltinVar->getType()->getAddressSpace(); + if (addressSpace != SPIRV::SPIRAS_Output) + return false; + + Type *valueType = globalBuiltinVar->getValueType(); + // NOTE: If the global value type to be split is a structure or array. + if (!valueType->isStructTy() && !valueType->isArrayTy()) + return false; + + MDNode *globalVarMetaNode = globalBuiltinVar->getMetadata(gSPIRVMD::InOut); + Constant *inOutMetaConst = mdconst::dyn_extract(globalVarMetaNode->getOperand(0)); + Constant *firstMemberMeta = nullptr; + Type *firstMemberTy = nullptr; + + if (valueType->isArrayTy()) { + Type *arrayElemmentTy = valueType->getArrayElementType(); + // Note: If the global value type to be split is an array, the member type must be a structure type. 
+ // This is because, according to OpenGL specifications, members of gl_in, gl_out, and gl_MeshVerticesEXT must be of + // structure type. + if (!arrayElemmentTy->isStructTy()) + return false; + + Constant *structureMds = dyn_cast(inOutMetaConst->getOperand(1)); + + firstMemberTy = arrayElemmentTy->getStructElementType(0); + firstMemberMeta = dyn_cast(structureMds->getOperand(0)); + } else if (globalBuiltinVar->getValueType()->isStructTy()) { + // NOTE: If the global value type to be split is a structure, the first member of the structure must be a built-in + // value or a location type for compatibility variables. Only such structures can be split. + Type *globalBuiltinVarTy = globalBuiltinVar->getValueType(); + assert(globalBuiltinVarTy->isStructTy()); + + firstMemberTy = globalBuiltinVarTy->getStructElementType(0); + firstMemberMeta = cast(inOutMetaConst->getOperand(0)); + } + + // NOTE: If the first member is of structure type, we do not need to split it because gl_in, gl_out, or gl_PerVertex + // do not have any members that are of structure type. + if (firstMemberTy->isStructTy()) + return false; + ShaderInOutMetadata firstMeta = getShaderInOutMetadata(firstMemberTy, firstMemberMeta); + // Note: This condition handles only built-in and location value types. 
+ assert(firstMeta.IsBuiltIn || firstMeta.IsLoc); + unsigned builtInId = firstMeta.Value; + if (firstMeta.IsBuiltIn) { + switch (builtInId) { + case spv::BuiltInPosition: + case spv::BuiltInPointSize: + case spv::BuiltInClipDistance: + case spv::BuiltInCullDistance: + return true; + default: + return false; + } + } else { + switch (builtInId) { + case Vkgc::GlCompatibilityInOutLocation::ClipVertex: + case Vkgc::GlCompatibilityInOutLocation::FrontColor: + case Vkgc::GlCompatibilityInOutLocation::BackColor: + case Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor: + case Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor: + case Vkgc::GlCompatibilityInOutLocation::TexCoord: + case Vkgc::GlCompatibilityInOutLocation::FogFragCoord: + return true; + default: + return false; + } + } + + return false; +} + +// ===================================================================================================================== +// Resolves the name of a built-in shader element based on its metadata. 
+// +// @param inOutMeta : Reference to the metadata structure describing the shader element +// @returns : The resolved name of the built-in shader element as a StringRef +StringRef ScalarReplacementOfBuiltins::getBuiltinElementName(ShaderInOutMetadata &inOutMeta) { + StringRef builtinElementName; + unsigned builtInId = inOutMeta.Value; + if (inOutMeta.IsBuiltIn) { + switch (builtInId) { + case spv::BuiltInPosition: + builtinElementName = "_gl_Position"; + break; + case spv::BuiltInPointSize: + builtinElementName = "_gl_PointSize"; + break; + case spv::BuiltInClipDistance: + builtinElementName = "_gl_ClipDistance"; + break; + case spv::BuiltInCullDistance: + builtinElementName = "_gl_CullDistance"; + break; + default: + llvm_unreachable("Not implemented"); + break; + } + } else { + switch (builtInId) { + case Vkgc::GlCompatibilityInOutLocation::ClipVertex: + builtinElementName = "_gl_ClipVertex"; + break; + case Vkgc::GlCompatibilityInOutLocation::FrontColor: + builtinElementName = "_gl_FrontColor"; + break; + case Vkgc::GlCompatibilityInOutLocation::BackColor: + builtinElementName = "_gl_BackColor"; + break; + case Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor: + builtinElementName = "_gl_FrontSecondaryColor"; + break; + case Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor: + builtinElementName = "_gl_BackSecondaryColor"; + break; + case Vkgc::GlCompatibilityInOutLocation::TexCoord: + builtinElementName = "_gl_TexCoord"; + break; + case Vkgc::GlCompatibilityInOutLocation::FogFragCoord: + builtinElementName = "_gl_FogFragCoord"; + break; + default: + llvm_unreachable("Not implemented"); + break; + } + } + return builtinElementName; +} + +// ===================================================================================================================== +// Removes unused newly created built-in global variables. 
+// +// @param elements : Vector of users associated with newly created global variables +void ScalarReplacementOfBuiltins::cleanUpUnusedGlobals(SmallVector &elements) { + for (User *user : make_early_inc_range(elements)) { + GlobalVariable *globalValueReplace = cast(user); + if (globalValueReplace->users().empty()) { + globalValueReplace->dropAllReferences(); + globalValueReplace->eraseFromParent(); + } + } + return; +} + +// ===================================================================================================================== +// Replaces users of a global variable with newly created global variables. +// +// @param globalBuiltinVar : Global variable containing built-in type +// @param elements : Vector of users associated with newly created global variables +void ScalarReplacementOfBuiltins::replaceGlobalBuiltinVar(GlobalVariable *globalBuiltinVar, + SmallVector &elements) { + convertUsersOfConstantsToInstructions(globalBuiltinVar); + for (User *user : make_early_inc_range(globalBuiltinVar->users())) { + if (StoreInst *storeInst = dyn_cast(user)) { + [[maybe_unused]] const DataLayout &dataLayout = storeInst->getModule()->getDataLayout(); + GlobalVariable *globalVar = cast(elements[0]); + assert(dataLayout.getTypeStoreSize(storeInst->getValueOperand()->getType()) <= + dataLayout.getTypeStoreSize(globalVar->getValueType())); + storeInst->replaceUsesOfWith(globalBuiltinVar, globalVar); + } else if (LoadInst *loadInst = dyn_cast(user)) { + GlobalVariable *LoadValue = cast(elements[0]); + loadInst->replaceUsesOfWith(globalBuiltinVar, LoadValue); + } else if (auto *gepInst = dyn_cast(user)) { + SmallVector indices; + GlobalVariable *globalValueReplace = nullptr; + Type *globalValueReplaceTy = nullptr; + unsigned index = UINT_MAX; + + if (globalBuiltinVar->getValueType()->isStructTy()) { + // Note: The newly generated global variables are created based on the elements of the original global structure + // variable. 
Therefore, when encountering a GetElementPtr (GEP) instruction, we utilize the second operand to + // determine which of the newly generated global variables corresponds to a specific element in the original + // structure. + // Example: + // GEP Instruction: getelementptr ({ <4 x float>, float... }, ptr addrspace(65) @0, i32 0, i32 4) + // Here, `gepInst->idx_begin() + 1` retrieves the index to access the fourth element of the + // original structure (0-indexed), which corresponds to the fourth newly created global variable. + // This allows matching the GEP indices with the corresponding split global variables. + index = cast(gepInst->idx_begin() + 1)->getZExtValue(); + indices.push_back(*(gepInst->idx_begin())); + unsigned int numIndices = gepInst->getNumIndices(); + if (numIndices >= 3) + indices.append(gepInst->idx_begin() + 2, gepInst->idx_end()); + assert(cast(indices[0])->isZero() && "Non-zero GEP first index\n"); + } else if (globalBuiltinVar->getValueType()->isArrayTy()) { + // Note: The newly generated global variables are derived from the elements of the original array. + // When processing a GetElementPtr (GEP) instruction that navigates through such an array, the third operand + // (after the base pointer and the initial index which is typically zero) indicates the specific element + // in the array that is being accessed. + // Example: + // GEP Instruction: getelementptr [3 x { <4 x float>, ... }], ptr addrspace(65) @gl_out, i32 0, i32 %5, i32 4 + // In this example, `gepInst->idx_begin() + 2` corresponds to `i32 4`, which is used to access the fourth + // element of the array (0-indexed). This element index is used to determine the appropriate newly created + // global variable that corresponds to this element in the original array structure. This indexing helps in + // directly mapping the GEP instruction indices to the split global variables. 
+ index = cast(gepInst->idx_begin() + 2)->getZExtValue(); + for (auto it = gepInst->idx_begin(); it != gepInst->idx_end(); ++it) { + if (it - gepInst->idx_begin() == 2) + continue; + indices.push_back(*it); + } + } else { + llvm_unreachable("Not implemented"); + } + + globalValueReplace = cast(elements[index]); + globalValueReplaceTy = globalValueReplace->getValueType(); + m_builder->SetInsertPoint(gepInst); + Value *gepElement = + m_builder->CreateGEP(globalValueReplaceTy, elements[index], indices, "", + gepInst->isInBounds() ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none()); + gepInst->replaceAllUsesWith(gepElement); + gepInst->eraseFromParent(); + } else { + llvm_unreachable("Not implemented"); + } + } + return; +} + +// ===================================================================================================================== +// Splits a global variable of structure type containing built-in elements into individual components. +// +// @param globalBuiltinVar : Global variable containing built-in type +void ScalarReplacementOfBuiltins::splitBuiltinStructure(GlobalVariable *globalBuiltinVar) { + SmallVector elements; + StringRef prefixName = globalBuiltinVar->getName(); + MDNode *metaNode = globalBuiltinVar->getMetadata(gSPIRVMD::InOut); + assert(metaNode); + Constant *inOutMetaConst = mdconst::extract(metaNode->getOperand(0)); + Type *globalBuiltinVarTy = globalBuiltinVar->getValueType(); + assert(globalBuiltinVarTy->isStructTy()); + auto structElementCount = globalBuiltinVarTy->getStructNumElements(); + assert(structElementCount == inOutMetaConst->getType()->getStructNumElements()); + + for (unsigned idx = 0; idx < structElementCount; ++idx) { + Type *elementType = globalBuiltinVarTy->getStructElementType(idx); + Constant *elementMetadata = cast(inOutMetaConst->getOperand(idx)); + ShaderInOutMetadata inOutMeta = getShaderInOutMetadata(elementType, elementMetadata); + + // Note: This condition handles only built-in and location value types. 
+ assert(inOutMeta.IsBuiltIn || inOutMeta.IsLoc); + StringRef builtinElementName = getBuiltinElementName(inOutMeta); + GlobalVariable *replacementBuiltinVar = new GlobalVariable( + *m_module, elementType, false, GlobalValue::ExternalLinkage, nullptr, prefixName + builtinElementName, nullptr, + GlobalVariable::NotThreadLocal, SPIRV::SPIRAS_Output); + + replacementBuiltinVar->addMetadata(gSPIRVMD::InOut, + *MDNode::get(*m_context, {ConstantAsMetadata::get(elementMetadata)})); + elements.push_back(replacementBuiltinVar); + } + + // NOTE: Replace global variable users. + replaceGlobalBuiltinVar(globalBuiltinVar, elements); + + // Cleans up unused newly created built-in global variables. + cleanUpUnusedGlobals(elements); + + globalBuiltinVar->dropAllReferences(); + globalBuiltinVar->eraseFromParent(); + return; +} + +// ===================================================================================================================== +// Splits a global variable of array type containing built-in elements into individual components. 
+// +// @param globalBuiltinVar : Global variable containing built-in type +void ScalarReplacementOfBuiltins::splitBuiltinArray(GlobalVariable *globalBuiltinVar) { + assert(globalBuiltinVar->getValueType()->getArrayElementType()->isStructTy()); + Type *arrayElemmentTy = globalBuiltinVar->getValueType()->getArrayElementType(); + auto structureElementNum = arrayElemmentTy->getStructNumElements(); + StringRef prefixName = globalBuiltinVar->getName(); + auto arrayElementNum = globalBuiltinVar->getValueType()->getArrayNumElements(); + SmallVector elements; + MDNode *globalVarMetaNode = globalBuiltinVar->getMetadata(gSPIRVMD::InOut); + assert(globalVarMetaNode); + Constant *inOutMetaConst = mdconst::dyn_extract(globalVarMetaNode->getOperand(0)); + Constant *structureMds = dyn_cast(inOutMetaConst->getOperand(1)); + auto int32Type = m_builder->getInt32Ty(); + auto int64Type = m_builder->getInt64Ty(); + + for (int idx = 0; idx < structureElementNum; ++idx) { + Constant *memberMeta = dyn_cast(structureMds->getOperand(idx)); + assert(memberMeta && "memberMeta should not be null"); + + Type *memberElementTy = arrayElemmentTy->getStructElementType(idx); + ShaderInOutMetadata inOutMeta = getShaderInOutMetadata(memberElementTy, memberMeta); + auto builtInId = inOutMeta.Value; + ArrayType *replaceElementTy = ArrayType::get(memberElementTy, arrayElementNum); + // Note: This condition handles only built-in and location value types. 
+ assert((inOutMeta.IsBuiltIn || inOutMeta.IsLoc) && "Expected built-in or location metadata"); + StringRef builtinElementName = getBuiltinElementName(inOutMeta); + + GlobalVariable *replaceBuiltinElement = + new GlobalVariable(*m_module, replaceElementTy, globalBuiltinVar->isConstant(), globalBuiltinVar->getLinkage(), + nullptr, prefixName + builtinElementName, nullptr, globalBuiltinVar->getThreadLocalMode(), + globalBuiltinVar->getType()->getAddressSpace()); + + ShaderInOutMetadata memberInOutMd = {}; + memberInOutMd.IsBuiltIn = inOutMeta.IsBuiltIn; + memberInOutMd.IsLoc = inOutMeta.IsLoc; + memberInOutMd.Value = builtInId; + + Type *elmdTy = memberMeta->getType(); + StructType *mdTy = StructType::get(*m_context, {int32Type, elmdTy, int64Type, int64Type}); + SmallVector mdValues; + mdValues.push_back(ConstantInt::get(int32Type, 1)); + mdValues.push_back(memberMeta); + mdValues.push_back(ConstantInt::get(int64Type, memberInOutMd.U64All[0])); + mdValues.push_back(ConstantInt::get(int64Type, memberInOutMd.U64All[1])); + + Constant *mdVariable = ConstantStruct::get(mdTy, mdValues); + replaceBuiltinElement->addMetadata(gSPIRVMD::InOut, + *MDNode::get(*m_context, {ConstantAsMetadata::get(mdVariable)})); + elements.push_back(replaceBuiltinElement); + } + + replaceGlobalBuiltinVar(globalBuiltinVar, elements); + + // Cleans up unused newly created built-in global variables. + cleanUpUnusedGlobals(elements); + globalBuiltinVar->dropAllReferences(); + globalBuiltinVar->eraseFromParent(); + return; +} + +} // namespace Llpc diff --git a/llpc/lowering/ScalarReplacementOfBuiltins.h b/llpc/lowering/ScalarReplacementOfBuiltins.h new file mode 100644 index 0000000000..20ae5787ed --- /dev/null +++ b/llpc/lowering/ScalarReplacementOfBuiltins.h @@ -0,0 +1,59 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file ScalarReplacementOfBuiltins.h + * @brief LLPC header file: split and replace global variables that are structures containing built-in values. 
+ *********************************************************************************************************************** + */ +#pragma once + +#include "Lowering.h" +#include "SPIRVInternal.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/PassManager.h" + +namespace Llpc { + +// ===================================================================================================================== +// Pass that splits and replaces global variables that are structures containing built-in values +class ScalarReplacementOfBuiltins : public SpirvLower, public llvm::PassInfoMixin { +public: + ScalarReplacementOfBuiltins() {} + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + + static llvm::StringRef name() { return "Scalar replacement of builtins"; } + +private: + ShaderInOutMetadata getShaderInOutMetadata(Type *elementType, Constant *elementMetadata); + bool needsSplit(GlobalVariable *builtinGlobalVar); + StringRef getBuiltinElementName(ShaderInOutMetadata &inOutMeta); + void cleanUpUnusedGlobals(SmallVector &elements); + void replaceGlobalBuiltinVar(GlobalVariable *builtinGlobalVar, SmallVector &elements); + void splitBuiltinStructure(GlobalVariable *builtinGlobalVar); + void splitBuiltinArray(GlobalVariable *builtinGlobalVar); +}; + +} // namespace Llpc diff --git a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm index 014c4f991f..44f5f4b79f 100644 --- a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm +++ b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py ; BEGIN_SHADERTEST -; RUN: amdllpc --print-after=llpc-spirv-lower-translator -filetype=asm -o - 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s +; RUN: amdllpc --print-after=lower-translator -filetype=asm -o - 2>&1 %s | FileCheck 
-check-prefixes=SHADERTEST %s ; #version 450 ; #extension GL_EXT_nonuniform_qualifier : require ; #extension GL_ARB_gpu_shader_int64 : require diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp index 90659fc955..24e47f5333 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp @@ -81,7 +81,7 @@ void main() ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0) ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0) ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16(i32 9, i32 8, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0) -; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.i32(float 9.000000e+00, i32 7, i32 7, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0) +; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.{{i32|i16}}(float 9.000000e+00, {{i32|i16}} 7, {{i32|i16}} 7, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0) ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag index e5eb6a602c..a9e619ecc8 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag @@ -83,7 +83,7 @@ void main() ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.cube.i32.i16(i32 %{{.*}}, i16 2, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0) ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.cube.i32.i16(i32 %{{.*}}, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0) ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.cube.i32.i16(i32 %{{.*}}, i32 17, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0) -; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.i32(float 
%{{[-0-9A-Za0z_.]+}}, i32 3, i32 3, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0) +; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.{{i32|i16}}(float %{{[-0-9A-Za-z_.]+}}, {{i32|i16}} 3, {{i32|i16}} 3, <8 x i32> %{{[-0-9A-Za-z_.]+}}, i32 0, i32 0) ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestSharedVariable_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestSharedVariable_lit.comp index d10801438c..72027ec915 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestSharedVariable_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestSharedVariable_lit.comp @@ -61,7 +61,6 @@ void main() // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: @{{.*}} = addrspace(3) global i32 diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestStorageBlockAndSharedWithData64_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestStorageBlockAndSharedWithData64_lit.comp index 2a3e1af505..b7727a2673 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestStorageBlockAndSharedWithData64_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestStorageBlockAndSharedWithData64_lit.comp @@ -80,7 +80,6 @@ void main () // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: atomicrmw umin ptr addrspace({{.*}}) %{{[0-9]*}}, i64 %{{[0-9]*}} monotonic ; SHADERTEST: atomicrmw umax ptr addrspace({{.*}}) %{{[0-9]*}}, i64 %{{[0-9]*}} monotonic diff --git a/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp b/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp index b27b475c72..8058ea943b 100644 --- a/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp +++ b/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by
tool/update_llpc_test_checks.py // RUN: amdllpc -o - -filetype=asm %s | FileCheck -check-prefixes=CHECK %s +// REQUIRES: do-not-run-me #version 450 #extension GL_KHR_shader_subgroup_arithmetic : require @@ -27,12 +28,9 @@ void main() { // CHECK-NEXT: v_lshlrev_b32_e32 v4, 2, v4 // CHECK-NEXT: s_waitcnt lgkmcnt(0) // CHECK-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen -// CHECK-NEXT: s_waitcnt vmcnt(0) -// CHECK-NEXT: v_mov_b32_e32 v0, v5 -// CHECK-NEXT: s_not_b64 exec, exec -// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 -// CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[0:1], -1 +// CHECK-NEXT: s_waitcnt vmcnt(0) +// CHECK-NEXT: v_cndmask_b32_e64 v0, 0xff800000, v5, s[0:1] // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1 // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 row_half_mirror row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -44,18 +42,13 @@ void main() { // CHECK-NEXT: v_max_f32_e64 v0, s2, s3 // CHECK-NEXT: s_mov_b64 exec, s[0:1] // CHECK-NEXT: v_mov_b32_e32 v5, v0 -// CHECK-NEXT: s_or_saveexec_b64 s[0:1], -1 -// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 -// CHECK-NEXT: s_mov_b64 exec, s[0:1] -// CHECK-NEXT: v_mov_b32_e32 v1, v5 -// CHECK-NEXT: s_not_b64 exec, exec -// CHECK-NEXT: v_mov_b32_e32 v1, 0xff800000 -// CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1 -// CHECK-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf +// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 +// CHECK-NEXT: v_cndmask_b32_e64 v1, 0xff800000, v5, s[2:3] // CHECK-NEXT: v_mov_b32_e32 v2, 0xff800000 // CHECK-NEXT: s_mov_b32 vcc_lo, 0xffff0000 // CHECK-NEXT: s_mov_b32 vcc_hi, vcc_lo +// CHECK-NEXT: v_mov_b32_dpp v0, v1 row_shr:1 row_mask:0xf bank_mask:0xf // CHECK-NEXT: v_max_f32_e32 v0, v1, v0 // CHECK-NEXT: v_mov_b32_e32 v1, 0xff800000 // CHECK-NEXT: v_mov_b32_dpp v2, v0 row_shr:2 
row_mask:0xf bank_mask:0xf @@ -76,15 +69,12 @@ void main() { // CHECK-NEXT: v_max_f32_e32 v0, v0, v1 // CHECK-NEXT: s_mov_b64 exec, s[2:3] // CHECK-NEXT: v_mov_b32_e32 v5, v0 -// CHECK-NEXT: v_mov_b32_e32 v0, v5 -// CHECK-NEXT: s_not_b64 exec, exec -// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 -// CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +// CHECK-NEXT: v_cndmask_b32_e64 v0, 0xff800000, v5, s[8:9] // CHECK-NEXT: s_mov_b32 s2, 0x6543210f // CHECK-NEXT: v_mov_b32_e32 v2, 0xff800000 -// CHECK-NEXT: v_permlane16_b32 v0, v0, s2, 0xedcba987 op_sel:[1,0] // CHECK-NEXT: v_mov_b32_e32 v3, 0xff800000 +// CHECK-NEXT: v_permlane16_b32 v0, v0, s2, 0xedcba987 op_sel:[1,0] // CHECK-NEXT: v_readlane_b32 s2, v0, 16 // CHECK-NEXT: v_writelane_b32 v0, s2, 48 // CHECK-NEXT: s_mov_b32 s2, 0xff800000 @@ -114,11 +104,8 @@ void main() { // CHECK-NEXT: v_max_f32_e32 v0, v0, v1 // CHECK-NEXT: s_mov_b64 exec, s[8:9] // CHECK-NEXT: v_mov_b32_e32 v5, v0 -// CHECK-NEXT: v_mov_b32_e32 v0, v5 -// CHECK-NEXT: s_not_b64 exec, exec -// CHECK-NEXT: v_mov_b32_e32 v0, 0xff800000 -// CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[0:1], -1 +// CHECK-NEXT: v_cndmask_b32_e64 v0, 0xff800000, v5, s[0:1] // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf bound_ctrl:1 // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 // CHECK-NEXT: v_max_f32_dpp v0, v0, v0 row_half_mirror row_mask:0xf bank_mask:0xf bound_ctrl:1 diff --git a/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert b/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert index 18f2387123..76a6e50288 100644 --- a/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert +++ b/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert @@ -22,8 +22,8 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=WITHOUT_IIE %s ; WITHOUT_IIE-LABEL: {{^// LLPC}} pipeline 
before-patching results ; WITHOUT_IIE: %[[val:.*]] = extractvalue [4 x <4 x float>] %{{.*}}, 3 -; WITHOUT_IIE: %[[mul:.*]] = fmul <4 x float> %[[val]], %{{.*}} -; WITHOUT_IIE: %[[arg:.*]] = fadd <4 x float> %{{.*}}, %[[mul]] +; WITHOUT_IIE: %[[mul:.*]] = fmul nnan nsz <4 x float> %[[val]], %{{.*}} +; WITHOUT_IIE: %[[arg:.*]] = fadd nnan nsz <4 x float> %{{.*}}, %[[mul]] ; WITHOUT_IIE-NEXT: call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 x float> %[[arg]]) ; WITHOUT_IIE: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert index 30c0a02033..ec588303a7 100644 --- a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert +++ b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert @@ -29,8 +29,8 @@ void main() // //. // CHECK: attributes #[[ATTR0]] = { alwaysinline nounwind "denormal-fp-math-f32"="preserve-sign" } -// CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } -// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn memory(read) } +// CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) } +// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind } //. 
// CHECK: [[META0:![0-9]+]] = !{!"Vulkan"} // CHECK: [[META1:![0-9]+]] = !{i32 1} diff --git a/llpc/test/shaderdb/extensions/ExtShaderInt64_TestRelationalOp_lit.frag b/llpc/test/shaderdb/extensions/ExtShaderInt64_TestRelationalOp_lit.frag index 4fb6f2cbf6..bb5734d317 100644 --- a/llpc/test/shaderdb/extensions/ExtShaderInt64_TestRelationalOp_lit.frag +++ b/llpc/test/shaderdb/extensions/ExtShaderInt64_TestRelationalOp_lit.frag @@ -1,6 +1,5 @@ // NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py // RUN: amdllpc -emit-lgc -gfxip 10.3 -o - %s | FileCheck -check-prefix=SHADERTEST %s -// REQUIRES: do-not-run-me #version 450 @@ -37,9 +36,9 @@ void main() // SHADERTEST-NEXT: .entry: // SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) -// SHADERTEST-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 32 +// SHADERTEST-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP0]], i32 32 // SHADERTEST-NEXT: [[TMP3:%.*]] = load <3 x i64>, ptr addrspace(7) [[TMP2]], align 32 -// SHADERTEST-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 64 +// SHADERTEST-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP0]], i32 64 // SHADERTEST-NEXT: [[TMP5:%.*]] = load <3 x i64>, ptr addrspace(7) [[TMP4]], align 32 // SHADERTEST-NEXT: [[TMP6:%.*]] = extractelement <3 x i64> [[TMP3]], i64 0 // SHADERTEST-NEXT: [[TMP7:%.*]] = extractelement <3 x i64> [[TMP5]], i64 0 @@ -114,7 +113,7 @@ void main() // SHADERTEST: 67: // SHADERTEST-NEXT: [[DOT022_IN:%.*]] = phi <3 x i1> [ [[TMP51]], [[TMP37]] ], [ [[TMP66]], [[TMP52]] ] // SHADERTEST-NEXT: [[TMP68:%.*]] = load i64, ptr addrspace(7) [[TMP0]], align 8 -// SHADERTEST-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 8 +// SHADERTEST-NEXT: [[TMP69:%.*]] = getelementptr 
i8, ptr addrspace(7) [[TMP0]], i32 8 // SHADERTEST-NEXT: [[TMP70:%.*]] = load i64, ptr addrspace(7) [[TMP69]], align 8 // SHADERTEST-NEXT: [[TMP71:%.*]] = icmp ne i64 [[TMP68]], [[TMP70]] // SHADERTEST-NEXT: [[COND_FREEZE4:%.*]] = freeze i1 [[TMP71]] diff --git a/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp b/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp index 48598a48f4..ab2da40227 100644 --- a/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp +++ b/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp @@ -42,7 +42,6 @@ void main() // BEGIN_SHADERTEST /* ; RUN: amdllpc -enable-load-scalarizer=false -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST-COUNT-4: getelementptr {{.*}}[4 x { i8, <2 x i8>, <3 x i8>, <4 x i8> }], ptr addrspace(3) @{{.*}}, i32 0, i32 {{%?[0-9]+}}, i32 {{[0-3]}} diff --git a/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport.spvasm b/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport.spvasm new file mode 100644 index 0000000000..a191df60db --- /dev/null +++ b/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport.spvasm @@ -0,0 +1,50 @@ +; SPIR-V +; Version: 1.6 +; Generator: Khronos Glslang Reference Front End; 11 +; Bound: 19 +; Schema: 0 + OpCapability Shader + %1 = OpExtInstImport "GLSL.std.450" + OpMemoryModel Logical GLSL450 + OpEntryPoint Fragment %main "main" %_ + OpExecutionMode %main OriginUpperLeft + OpSource GLSL 450 + OpName %main "main" + OpName %UniformData "UniformData" + OpMemberName %UniformData 0 "valueNonZero" + OpName %_ "" + OpMemberDecorate %UniformData 0 Offset 0 + OpDecorate %UniformData Block + OpDecorate %_ DescriptorSet 0 + OpDecorate %_ Binding 0 + %void = OpTypeVoid + %3 = OpTypeFunction %void + %int = OpTypeInt 32 1 +%UniformData = 
OpTypeStruct %int +%_ptr_Uniform_UniformData = OpTypePointer Uniform %UniformData + %_ = OpVariable %_ptr_Uniform_UniformData Uniform + %int_0 = OpConstant %int 0 +%_ptr_Uniform_int = OpTypePointer Uniform %int + %bool = OpTypeBool + %main = OpFunction %void None %3 + %5 = OpLabel + %12 = OpAccessChain %_ptr_Uniform_int %_ %int_0 + %13 = OpLoad %int %12 + %15 = OpINotEqual %bool %13 %int_0 + OpSelectionMerge %17 None + OpBranchConditional %15 %16 %17 + %16 = OpLabel + OpTerminateInvocation + %17 = OpLabel + OpReturn + OpFunctionEnd + +; BEGIN_SHADERTEST +; RUN: amdllpc -v %gfxip %s | FileCheck --check-prefix=SHADERTEST %s +; SHADERTEST-LABEL: {{^//}} LLPC final ELF info +; SHADERTEST: .cb_shader_mask: +; SHADERTEST-NEXT: .output0_enable: 0x0000000000000001 +; SHADERTEST: .spi_shader_col_format: +; SHADERTEST-NEXT: .col_0_export_format: 0x0000000000000001 +; SHADERTEST: AMDLLPC SUCCESS +; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport4.pipe b/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport4.pipe new file mode 100644 index 0000000000..e71c1b5a7f --- /dev/null +++ b/llpc/test/shaderdb/general/CbShaderMaskWithDummyExport4.pipe @@ -0,0 +1,65 @@ +[Version] +version = 75 + +[VsGlsl] +#version 450 core +void main() {} + +[VsInfo] +entryPoint = main + +[FsGlsl] +#version 450 core +layout(set = 0, binding = 0, std140) uniform UniformData +{ + int valueNonZero; +}; + +void main() { + if (valueNonZero != 0) + discard; +} + +[FsInfo] +entryPoint = main + +[GraphicsPipelineState] +colorBuffer[0].format = VK_FORMAT_R8G8B8A8_SRGB +colorBuffer[0].channelWriteMask = 7 +colorBuffer[0].blendEnable = 0 +colorBuffer[0].blendSrcAlphaToColor = 0 + +[ResourceMapping] +descriptorRangeValue[0].visibility = 66 +descriptorRangeValue[0].type = DescriptorConstBuffer +descriptorRangeValue[0].set = 0 +descriptorRangeValue[0].binding = 0 +descriptorRangeValue[0].arraySize = 1 +descriptorRangeValue[0].uintData = 134217874, 16773120, 603979776, 0 + 
+userDataNode[0].visibility = 2 +userDataNode[0].type = IndirectUserDataVaPtr +userDataNode[0].offsetInDwords = 0 +userDataNode[0].sizeInDwords = 1 +userDataNode[0].indirectUserDataCount = 4 +userDataNode[1].visibility = 66 +userDataNode[1].type = DescriptorTableVaPtr +userDataNode[1].offsetInDwords = 8 +userDataNode[1].sizeInDwords = 1 +userDataNode[1].next[0].type = DescriptorConstBuffer +userDataNode[1].next[0].offsetInDwords = 0 +userDataNode[1].next[0].sizeInDwords = 4 +userDataNode[1].next[0].set = 0x00000000 +userDataNode[1].next[0].binding = 0 +userDataNode[1].next[0].strideInDwords = 8 + + +; BEGIN_SHADERTEST +; RUN: amdllpc -v %gfxip %s | FileCheck --check-prefix=SHADERTEST %s +; SHADERTEST-LABEL: {{^//}} LLPC final ELF info +; SHADERTEST: .cb_shader_mask: +; SHADERTEST-NEXT: .output0_enable: 0x000000000000000F +; SHADERTEST: .spi_shader_col_format: +; SHADERTEST-NEXT: .col_0_export_format: 0x0000000000000004 +; SHADERTEST: AMDLLPC SUCCESS +; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineGsTess_TestInOutPacking.pipe b/llpc/test/shaderdb/general/PipelineGsTess_TestInOutPacking.pipe index a371688436..133d0d4913 100644 --- a/llpc/test/shaderdb/general/PipelineGsTess_TestInOutPacking.pipe +++ b/llpc/test/shaderdb/general/PipelineGsTess_TestInOutPacking.pipe @@ -17,11 +17,11 @@ ; SHADERTEST: (GS) Output: stream = 0, [location, component] = [2, 1] => Mapped = [1, 3] ; SHADERTEST: (GS) Output: stream = 0, [location, component] = [4, 0] => Mapped = [2, 0] ; SHADERTEST: (GS) Output: stream = 0, [location, component] = [4, 1] => Mapped = [2, 1] -; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 0] => Mapped = [2, 0] -; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 1] => Mapped = [2, 1] -; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 2] => Mapped = [2, 2] -; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 3] => Mapped = [2, 3] -; SHADERTEST: (GS) Output: stream = 1, [location, 
component] = [4, 3] => Mapped = [3, 0] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 0] => Mapped = [0, 0] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 1] => Mapped = [0, 1] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 2] => Mapped = [0, 2] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [3, 3] => Mapped = [0, 3] +; SHADERTEST: (GS) Output: stream = 1, [location, component] = [4, 3] => Mapped = [1, 0] ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 32, i32 15 ; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 33, i32 15 diff --git a/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe b/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe index d432d16530..87b36861ba 100644 --- a/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe +++ b/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe @@ -2,7 +2,7 @@ ; BEGIN_SHADERTEST ; REQUIRES: gpurt -; RUN: amdllpc --print-after=llpc-spirv-lower-translator -gfxip 10.3 -o /dev/null 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s +; RUN: amdllpc --print-after=lower-translator -gfxip 10.3 -o /dev/null 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s ; SHADERTEST-LABEL: @main( ; SHADERTEST: call void (...) 
@lgc.rt.trace.ray(i64 %{{[0-9]+}}, i32 0, i32 %{{[0-9]+}}, i32 0, i32 0, i32 0, <3 x float> %{{[0-9]+}}, float %{{[0-9]+}}, <3 x float> %{{[0-9]+}}, float %{{[0-9]+}}, ptr addrspace(5) @RayPayloadKHR0, [1 x i32] [i32 16]) ; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineTess_XfbWithManyComponents.pipe b/llpc/test/shaderdb/general/PipelineTess_XfbWithManyComponents.pipe index 468e4376e0..4533eca767 100644 --- a/llpc/test/shaderdb/general/PipelineTess_XfbWithManyComponents.pipe +++ b/llpc/test/shaderdb/general/PipelineTess_XfbWithManyComponents.pipe @@ -5,8 +5,8 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -v -gfxip=11 %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: LLPC geometry calculation factor results -; SHADERTEST: ES-GS ring item size (in dwords): 129 +; SHADERTEST-LABEL: LLPC HW GS configurations +; SHADERTEST: EsGsRingItemSize = 129 dwords ; SHADERTEST-LABEL: .fetchXfbOutput ; Write v4[31] = 4.0 -> LDS diff --git a/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe b/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe index 425456fb11..61e32230f7 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --function-signature -; RUN: amdllpc -stop-after=lgc-patch-entry-point-mutate -o - %s | FileCheck -check-prefixes=SHADERTEST %s +; RUN: amdllpc -stop-after=lgc-mutate-entry-point -o - %s | FileCheck -check-prefixes=SHADERTEST %s [Version] version = 64 @@ -115,7 +115,6 @@ attribute[1].offset = 16 ; SHADERTEST-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3 ; SHADERTEST-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP31]], i32 3 ; SHADERTEST-NEXT: [[VERTEX0_0:%.*]] = bitcast <4 x i32> [[TMP32]] to <4 x float> -; SHADERTEST-NEXT: call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 
x float> [[VERTEX0_0]]) #[[ATTR7:[0-9]+]] ; SHADERTEST-NEXT: [[TMP33:%.*]] = extractelement <2 x float> [[VERTEX1_0]], i64 0 ; SHADERTEST-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[VERTEX1_0]], i64 1 ; SHADERTEST-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[VERTEX1_0]], i64 0 @@ -132,7 +131,8 @@ attribute[1].offset = 16 ; SHADERTEST-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP45]], i64 2 ; SHADERTEST-NEXT: [[TMP47:%.*]] = bitcast i32 [[TMP40]] to float ; SHADERTEST-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP47]], i64 3 -; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP48]]) #[[ATTR7]] +; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP48]]) #[[ATTR7:[0-9]+]] +; SHADERTEST-NEXT: call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 x float> [[VERTEX0_0]]) #[[ATTR7]] ; SHADERTEST-NEXT: ret void ; ; diff --git a/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe b/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe index ac195358cc..bdbb83a8e3 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe @@ -2,9 +2,11 @@ // instruction combine removing subtraction for gl_Position computation. 
; BEGIN_SHADERTEST -; RUN: amdllpc --gfxip=10.3.0 -v %s | FileCheck -check-prefix=SHADERTEST %s +; RUN: amdllpc --gfxip=10.3.0 -v %s | FileCheck -check-prefixes=SHADERTEST,OPT %s +; RUN: amdllpc --gfxip=10.3.0 --disable-gl-position-opt=1 -v %s | FileCheck -check-prefixes=SHADERTEST,NOOPT %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: fsub float 1.000000e+00, %__llpc_input_proxy_in_Pos.0.vec.extract +; OPT: fsub nnan nsz float 1.000000e+00, %__llpc_input_proxy_in_Pos.0.vec.extract +; NOOPT: fsub float 1.000000e+00, %__llpc_input_proxy_in_Pos.0.vec.extract ; SHADERTEST-LABEL: _amdgpu_vs_main: ; SHADERTEST: v_sub_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} ; END_SHADERTEST @@ -51,5 +53,5 @@ binding[0].stride = 2 binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX attribute[0].location = 0 attribute[0].binding = 0 -attribute[0].format = VK_FORMAT_R8G8_SNORM +attribute[0].format = VK_FORMAT_R8G8B8A8_SNORM attribute[0].offset = 0 diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestBarycentric_tri_fan.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestBarycentric_tri_fan.pipe index 8d23a38228..5bad45c791 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestBarycentric_tri_fan.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestBarycentric_tri_fan.pipe @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --function amdgpu_ps_main ; RUN: amdllpc -filetype=asm -gfxip=10.3 -o - %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run ; This test tests barycentric coordinate when topology is VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN ; barycentric coordinate: (i ,j , 1 - i - j). 
diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe index c1c9296094..5b7d55e278 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe @@ -217,88 +217,4 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .wgp_mode: false ; CHECK-NEXT: .writes_depth: 0 ; CHECK-NEXT: .writes_uavs: false -; CHECK-NEXT: .vs: -; CHECK-NEXT: .checksum_value: 0xba71f629 -; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_vs_main -; CHECK-NEXT: .float_mode: 0xc0 -; CHECK-NEXT: .ieee_mode: false -; CHECK: .mem_ordered: true -; CHECK-NEXT: .scratch_en: false -; CHECK-NEXT: .scratch_memory_size: 0 -; CHECK-NEXT: .sgpr_count: 0x3 -; CHECK-NEXT: .sgpr_limit: 0x6a -; CHECK-NEXT: .trap_present: 0 -; CHECK-NEXT: .user_data_reg_map: -; CHECK-NEXT: - 0x10000000 -; CHECK-NEXT: - 0x10000003 -; CHECK-NEXT: - 0x10000004 -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: - 0xffffffff -; CHECK-NEXT: .user_sgprs: 0x3 -; CHECK-NEXT: .vgpr_count: 0x4 -; CHECK-NEXT: .vgpr_limit: 0x100 -; CHECK-NEXT: .wavefront_size: 0x20 -; CHECK-NEXT: .wgp_mode: false -; CHECK-NEXT: .internal_pipeline_hash: -; 
CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: .num_interpolants: 0x1 -; CHECK-NEXT: .registers: {} -; CHECK-NEXT: .shaders: -; CHECK-NEXT: .pixel: -; CHECK-NEXT: .api_shader_hash: -; CHECK-NEXT: - 0 -; CHECK-NEXT: - 0 -; CHECK-NEXT: .hardware_mapping: -; CHECK-NEXT: - .ps -; CHECK-NEXT: .vertex: -; CHECK-NEXT: .api_shader_hash: -; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: - 0 -; CHECK-NEXT: .hardware_mapping: -; CHECK-NEXT: - .vs -; CHECK-NEXT: .spill_threshold: 0xffff -; CHECK-NEXT: .streamout_vertex_strides: -; CHECK-NEXT: - 0 -; CHECK-NEXT: - 0 -; CHECK-NEXT: - 0 -; CHECK-NEXT: - 0 -; CHECK-NEXT: .type: VsPs -; CHECK-NEXT: .user_data_limit: 0x1 -; CHECK-NEXT: .xgl_cache_info: -; CHECK-NEXT: .128_bit_cache_hash: -; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: .llpc_version: {{.*}} -; CHECK-NEXT: amdpal.version: -; CHECK-NEXT: - 0x3 -; CHECK-NEXT: - 0 -; CHECK-NEXT: ... +; diff --git a/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm b/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm index a4c8ce1f79..804a467c3f 100644 --- a/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm +++ b/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm @@ -4,7 +4,6 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: @[[LDS0:[^ ]*]] = addrspace(3) global <{ [8 x i32] }> poison, align 4 ; SHADERTEST: @[[LDS1:[^ ]*]] = addrspace(3) global <{ [4 x i32] }> poison, align 4 @@ -14,9 +13,9 @@ ; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}([4 x i32], ptr addrspace(3) @[[LDS1]], i32 0, i32 2), align 4 ; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}([4 x i32], ptr addrspace(3) @[[LDS1]], i32 0, i32 3), align 4 ; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x 
i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1, i32 1), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1, i32 2), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1, i32 3), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1), i32 0, i32 1), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1), i32 0, i32 2), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [16 x i8], [4 x i32] }>, ptr addrspace(3) @[[LDS2]], i32 0, i32 1), i32 0, i32 3), align 4 ; SHADERTEST: load i32, ptr addrspace(3) @[[LDS0]], align 4 ; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}([8 x i32], ptr addrspace(3) @[[LDS0]], i32 0, i32 1), align 4 ; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}([8 x i32], ptr addrspace(3) @[[LDS0]], i32 0, i32 2), align 4 diff --git a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe index 4962809bbb..b68f5d6674 100644 --- a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe +++ b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe @@ -187,144 +187,4 @@ entryPoint = main ; SHADERTEST-NEXT: .vs_so_base2_en: false ; SHADERTEST-NEXT: .vs_so_base3_en: false ; SHADERTEST-NEXT: .vs_streamout_en: false -; SHADERTEST-NEXT: .hardware_stages: -; SHADERTEST-NEXT: .ps: -; SHADERTEST-NEXT: .checksum_value: 0x4658ef51 -; 
SHADERTEST-NEXT: .debug_mode: false -; SHADERTEST-NEXT: .entry_point: _amdgpu_ps_main -; SHADERTEST-NEXT: .float_mode: 0xc0 -; SHADERTEST-NEXT: .ieee_mode: false -; SHADERTEST-NEXT: .lds_size: 0 -; SHADERTEST-NEXT: .mem_ordered: true -; SHADERTEST-NEXT: .scratch_en: false -; SHADERTEST-NEXT: .scratch_memory_size: 0 -; SHADERTEST-NEXT: .sgpr_count: 0x2 -; SHADERTEST-NEXT: .sgpr_limit: 0x6a -; SHADERTEST-NEXT: .trap_present: 0 -; SHADERTEST-NEXT: .user_data_reg_map: -; SHADERTEST-NEXT: - 0x10000000 -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: .user_sgprs: 0x1 -; SHADERTEST-NEXT: .uses_uavs: false -; SHADERTEST-NEXT: .vgpr_count: 0x2 -; SHADERTEST-NEXT: .vgpr_limit: 0x100 -; SHADERTEST-NEXT: .wavefront_size: 0x40 -; SHADERTEST-NEXT: .wgp_mode: false -; SHADERTEST-NEXT: .writes_depth: 0 -; SHADERTEST-NEXT: .writes_uavs: false -; SHADERTEST-NEXT: .vs: -; SHADERTEST-NEXT: .checksum_value: 0xd2536693 -; SHADERTEST-NEXT: .debug_mode: false -; SHADERTEST-NEXT: .entry_point: _amdgpu_vs_main -; 
SHADERTEST-NEXT: .float_mode: 0xc0 -; SHADERTEST-NEXT: .ieee_mode: false -; SHADERTEST-NEXT: .lds_size: 0 -; SHADERTEST-NEXT: .mem_ordered: true -; SHADERTEST-NEXT: .scratch_en: false -; SHADERTEST-NEXT: .scratch_memory_size: 0 -; SHADERTEST-NEXT: .sgpr_count: 0x3 -; SHADERTEST-NEXT: .sgpr_limit: 0x6a -; SHADERTEST-NEXT: .trap_present: 0 -; SHADERTEST-NEXT: .user_data_reg_map: -; SHADERTEST-NEXT: - 0x10000000 -; SHADERTEST-NEXT: - 0x10000003 -; SHADERTEST-NEXT: - 0x10000004 -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: - 0xffffffff -; SHADERTEST-NEXT: .user_sgprs: 0x3 -; SHADERTEST-NEXT: .vgpr_count: 0x4 -; SHADERTEST-NEXT: .vgpr_limit: 0x100 -; SHADERTEST-NEXT: .wavefront_size: 0x20 -; SHADERTEST-NEXT: .wgp_mode: false -; SHADERTEST-NEXT: .internal_pipeline_hash: -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: .num_interpolants: 0x1 -; SHADERTEST-NEXT: .registers: {} -; SHADERTEST-NEXT: .shaders: -; SHADERTEST-NEXT: .pixel: -; SHADERTEST-NEXT: .api_shader_hash: -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: - 0 -; 
SHADERTEST-NEXT: .hardware_mapping: -; SHADERTEST-NEXT: - .ps -; SHADERTEST-NEXT: .vertex: -; SHADERTEST-NEXT: .api_shader_hash: -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: .hardware_mapping: -; SHADERTEST-NEXT: - .vs -; SHADERTEST-NEXT: .spill_threshold: 0xffff -; SHADERTEST-NEXT: .streamout_vertex_strides: -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: .type: VsPs -; SHADERTEST-NEXT: .user_data_limit: 0x1 -; SHADERTEST-NEXT: .xgl_cache_info: -; SHADERTEST-NEXT: .128_bit_cache_hash: -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} -; SHADERTEST-NEXT: .llpc_version: {{.*}} -; SHADERTEST-NEXT: amdpal.version: -; SHADERTEST-NEXT: - 0x3 -; SHADERTEST-NEXT: - 0 -; SHADERTEST-NEXT: ... +; diff --git a/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe b/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe index 8c25a3d7c9..25d6e37870 100644 --- a/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe +++ b/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe @@ -1,12 +1,11 @@ ; Test to check that the optimization of tessellation factors store are handled as expected -; REQUIRES: do-not-run-me ; RUN: amdllpc %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: @_amdgpu_hs_main( ; SHADERTEST-LABEL: .distribHsPatchCount: ; SHADERTEST-NEXT: %[[HS_PATCH_COUNT_SHIFT:[^ ,]*]] = lshr i32 %mergeWaveInfo, 16 ; SHADERTEST-NEXT: %[[HS_PATCH_COUNT:[^ ,]*]] = and i32 %[[HS_PATCH_COUNT_SHIFT]], 255 -; SHADERTEST-NEXT: store i32 %[[HS_PATCH_COUNT]], ptr addrspace(3) getelementptr inbounds ([649 x i32], ptr addrspace(3) @Lds.HS, i32 0, i32 640), align 4 +; SHADERTEST-NEXT: store i32 %[[HS_PATCH_COUNT]], ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @Lds.HS, i32 2560), align 4 ; SHADERTEST-NEXT: br label %.endDistribHsPatchCount ; SHADERTEST-LABEL: .endDistribHsPatchCount: @@ -17,18 +16,18 @@ ; SHADERTEST-NEXT: br i1 
%validHsVert, label %.beginHs, label %.endHs ; SHADERTEST-LABEL: .endHs: -; SHADERTEST: %[[HS_PATCH_COUNT:[^ ,]*]] = load i32, ptr addrspace(3) getelementptr inbounds ([649 x i32], ptr addrspace(3) @Lds.HS, i32 0, i32 640), align 4 -; SHADERTEST: %hsPatchCount = call i32 @llvm.amdgcn.readfirstlane(i32 %[[HS_PATCH_COUNT]]) +; SHADERTEST: %[[HS_PATCH_COUNT:[^ ,]*]] = load i32, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @Lds.HS, i32 2560), align 4 +; SHADERTEST: %hsPatchCount = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %[[HS_PATCH_COUNT]]) ; SHADERTEST: %validHsPatch = icmp ult i32 %threadIdInGroup, %hsPatchCount ; SHADERTEST: br i1 %validHsPatch, label %.checkSpecialTfInWave, label %.endCheckSpecialTfInWave ; SHADERTEST-LABEL: .checkSpecialTfInWave: -; SHADERTEST-NEXT: %[[OUTER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 6 -; SHADERTEST-NEXT: %[[OUTER_TF_I_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[OUTER_TF_OFFSET_0]] +; SHADERTEST-NEXT: %[[OUTER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 24 +; SHADERTEST-NEXT: %[[OUTER_TF_I_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 %[[OUTER_TF_OFFSET_0]] ; SHADERTEST-NEXT: %[[OUTER_TF_PTR:[^ ,]*]] = getelementptr {{(i8|i32)}}, ptr addrspace(3) %[[OUTER_TF_I_PTR]], i32 {{(256|1024)}} ; SHADERTEST-NEXT: %[[OUTER_TF:[^ ,]*]] = load <4 x float>, ptr addrspace(3) %[[OUTER_TF_PTR]], align 4 -; SHADERTEST-NEXT: %[[INNER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 6 -; SHADERTEST-NEXT: %[[INNER_TF_I_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[INNER_TF_OFFSET_0]] +; SHADERTEST-NEXT: %[[INNER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 24 +; SHADERTEST-NEXT: %[[INNER_TF_I_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 %[[INNER_TF_OFFSET_0]] ; SHADERTEST-NEXT: %[[INNER_TF_PTR:[^ ,]*]] = getelementptr {{(i8|i32)}}, ptr addrspace(3) %[[INNER_TF_I_PTR]], i32 {{(260|1040)}} ; SHADERTEST-NEXT: %[[INNER_TF:[^ ,]*]] = load <2 
x float>, ptr addrspace(3) %[[INNER_TF_PTR]], align 4 ; SHADERTEST-NEXT: %[[OUTER_TF_0:[^ ,]*]] = extractelement <4 x float> %[[OUTER_TF]], i64 0 diff --git a/llpc/test/shaderdb/gfx11/TestGdsOperationsForXfb.vert b/llpc/test/shaderdb/gfx11/TestGdsOperationsForXfb.vert new file mode 100644 index 0000000000..d1d3d65d68 --- /dev/null +++ b/llpc/test/shaderdb/gfx11/TestGdsOperationsForXfb.vert @@ -0,0 +1,33 @@ +// Test to check GDS operations that are required to support GFX11 transform feedback. Also, check +// ds_ordered_count is followed by s_waitcnt lgkmcnt(0), which is required by HW on GFX11. + +// RUN: amdllpc %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s + +// SHADERTEST-LABEL: {{^// LLPC}} final pipeline module info +// SHADERTEST: .prepareXfb: +// SHADERTEST: [[orderedWaveId0:%.*]] = inttoptr i32 %orderedWaveId to ptr addrspace(2) +// SHADERTEST-NEXT: call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) [[orderedWaveId0]], i32 0, i32 0, i32 0, i1 false, i32 16777216, i1 false, i1 false) +// SHADERTEST: call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %{{.*}}, i32 0) +// SHADERTEST-NEXT: call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 0, i32 4) +// SHADERTEST: [[orderedWaveId1:%.*]] = inttoptr i32 %orderedWaveId to ptr addrspace(2) +// SHADERTEST-NEXT: call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) [[orderedWaveId1]], i32 %{{.*}}, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true) + +// SHADERTEST-LABEL: {{^// LLPC}} final ELF info +// SHADERTEST: ds_ordered_count {{v[0-9]*}}, {{v[0-9]*}} gds +// SHADERTEST: s_waitcnt lgkmcnt(0) +// SHADERTEST: ds_add_gs_reg_rtn {{v[[0-9]*:[0-9]*]}}, {{v[0-9]*}} gds ; D9EA0000 03000300 +// SHADERTEST: s_waitcnt lgkmcnt(0) ; BF89FC07 +// SHADERTEST: ds_add_gs_reg_rtn {{v[[0-9]*:[0-9]*]}}, {{v[0-9]*}} offset:4 gds ; D9EA0004 04000600 +// SHADERTEST: s_waitcnt lgkmcnt(0) +// SHADERTEST: ds_ordered_count {{v[0-9]*}}, {{v[0-9]*}} offset:772 gds +// SHADERTEST: s_waitcnt lgkmcnt(0) + +#version 450 core + 
+layout(location = 0, xfb_buffer = 0, xfb_offset = 0, xfb_stride = 16) out vec4 data0; +layout(location = 1, xfb_buffer = 1, xfb_offset = 0, xfb_stride = 16) out vec4 data1; + +void main() { + data0 = vec4(0.0); + data1 = vec4(1.0); +} diff --git a/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe b/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe index f044fff685..04025d52ee 100644 --- a/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe +++ b/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe @@ -2,7 +2,7 @@ ; qualifier. In a location, only part of its components are exported to XFB buffer and they are not ; consecutive. -; RUN: amdllpc -gfxip=11 -stop-after=lgc-patch-copy-shader -v %s | FileCheck -check-prefix=CHECK %s +; RUN: amdllpc -gfxip=11 -stop-after=lgc-generate-copy-shader -v %s | FileCheck -check-prefix=CHECK %s ; CHECK-LABEL: @lgc.shader.COPY.main( ; CHECK: [[TMP1:%.*]] = call float @lgc.ngg.read.GS.output.f32(i32 0, i32 0, i32 0) diff --git a/llpc/test/shaderdb/object/ObjOutput_TestGsBuiltIn_lit.geom b/llpc/test/shaderdb/object/ObjOutput_TestGsBuiltIn_lit.geom index 92590afa53..a1959278ed 100644 --- a/llpc/test/shaderdb/object/ObjOutput_TestGsBuiltIn_lit.geom +++ b/llpc/test/shaderdb/object/ObjOutput_TestGsBuiltIn_lit.geom @@ -29,13 +29,13 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +; SHADERTEST: call void @lgc.output.export.builtin.PrimitiveId{{.*}} +; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} +; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: call void @lgc.output.export.builtin.Position{{.*}}.v4f32 ; SHADERTEST: call void @lgc.output.export.builtin.PointSize{{.*}}f32 ; SHADERTEST: call void @lgc.output.export.builtin.ClipDistance{{.*}}a3f32 ; SHADERTEST: call void @lgc.output.export.builtin.CullDistance{{.*}}a2f32 -; SHADERTEST: call void 
@lgc.output.export.builtin.PrimitiveId{{.*}} -; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} -; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/object/ObjOutput_TestTesBuiltIn_lit.tese b/llpc/test/shaderdb/object/ObjOutput_TestTesBuiltIn_lit.tese index 8834c49826..5d08ec5f9e 100644 --- a/llpc/test/shaderdb/object/ObjOutput_TestTesBuiltIn_lit.tese +++ b/llpc/test/shaderdb/object/ObjOutput_TestTesBuiltIn_lit.tese @@ -21,12 +21,12 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} +; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: call void @lgc.output.export.builtin.Position{{.*}}v4f32 ; SHADERTEST: call void @lgc.output.export.builtin.PointSize{{.*}}f32 ; SHADERTEST: call void @lgc.output.export.builtin.ClipDistance{{.*}}a3f32 ; SHADERTEST: call void @lgc.output.export.builtin.CullDistance{{.*}}a4f32 -; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} -; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/object/ObjOutput_TestVsBuiltIn_lit.vert b/llpc/test/shaderdb/object/ObjOutput_TestVsBuiltIn_lit.vert index 160c114043..f2f8d2eaca 100644 --- a/llpc/test/shaderdb/object/ObjOutput_TestVsBuiltIn_lit.vert +++ b/llpc/test/shaderdb/object/ObjOutput_TestVsBuiltIn_lit.vert @@ -17,12 +17,12 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} +; SHADERTEST: call void 
@lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST: call void @lgc.output.export.builtin.Position{{.*}}v4f32 ; SHADERTEST: call void @lgc.output.export.builtin.PointSize{{.*}}f32 ; SHADERTEST: call void @lgc.output.export.builtin.ClipDistance{{.*}}a4f32 ; SHADERTEST: call void @lgc.output.export.builtin.CullDistance{{.*}}a2f32 -; SHADERTEST: call void @lgc.output.export.builtin.Layer{{.*}} -; SHADERTEST: call void @lgc.output.export.builtin.ViewportIndex{{.*}} ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call void @llvm.amdgcn.exp.f32 ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe index bdf98525d7..fbe7b5a731 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe @@ -26,7 +26,7 @@ entryPoint = main [FsGlsl] #version 450 core -layout(set = 0, binding = 0) uniform sampler2DMS samp; +layout(set = 0, binding = 2) uniform sampler2DMS samp; layout(location = 0) in vec2 inUV; layout(location = 0) out vec4 oColor; @@ -62,7 +62,7 @@ userDataNode[0].next[1].type = DescriptorFmask userDataNode[0].next[1].offsetInDwords = 12 userDataNode[0].next[1].sizeInDwords = 8 userDataNode[0].next[1].set = 0 -userDataNode[0].next[1].binding = 0 +userDataNode[0].next[1].binding = 2 userDataNode[1].type = IndirectUserDataVaPtr userDataNode[1].offsetInDwords = 12 userDataNode[1].sizeInDwords = 1 diff --git a/llpc/tool/amdllpc.cpp b/llpc/tool/amdllpc.cpp index ce4f0e3a4f..b80cfa591c 100644 --- a/llpc/tool/amdllpc.cpp +++ b/llpc/tool/amdllpc.cpp @@ -370,6 +370,7 @@ extern opt EnablePipelineDump; extern opt PipelineDumpDir; extern opt EnableTimerProfile; extern opt BuildShaderCache; +extern OptionCategory AmdCategory; } // namespace cl } // namespace llvm @@ -412,10 +413,12 @@ CapabilityPrinter 
CapPrinterInstance; ExtensionPrinter ExtPrinterInstance; cl::opt> CapPrinter{"cap", cl::desc("Display the supported Capabilities."), - cl::location(CapPrinterInstance), cl::ValueDisallowed}; + cl::location(CapPrinterInstance), cl::ValueDisallowed, + cl::cat(cl::AmdCategory)}; cl::opt> ExtPrinter{"ext", cl::desc("Display the supported extensions."), - cl::location(ExtPrinterInstance), cl::ValueDisallowed}; + cl::location(ExtPrinterInstance), cl::ValueDisallowed, + cl::cat(cl::AmdCategory)}; } // namespace // ===================================================================================================================== diff --git a/llpc/tool/llpcCompilationUtils.cpp b/llpc/tool/llpcCompilationUtils.cpp index ae6dbf4a6e..06c49d5422 100644 --- a/llpc/tool/llpcCompilationUtils.cpp +++ b/llpc/tool/llpcCompilationUtils.cpp @@ -58,12 +58,12 @@ #endif #include "llpcCompilationUtils.h" +#include "LoweringUtil.h" #include "llpcAutoLayout.h" #include "llpcDebug.h" #include "llpcError.h" #include "llpcInputUtils.h" #include "llpcShaderModuleHelper.h" -#include "llpcSpirvLowerUtil.h" #include "llpcThreading.h" #include "llpcUtil.h" #ifndef LLPC_DISABLE_SPVGEN @@ -404,9 +404,8 @@ Error processInputPipeline(ICompiler *compiler, CompileInfo &compileInfo, const for (auto &libFileName : pipelineState->graphicsLibFileName) { if (!libFileName.empty()) { LLPC_OUTS(libFileName + "\n"); - auto inputSpecOrErr = parseInputFileSpec(libFileName); - assert(!inputSpecOrErr.takeError()); - compileInfo.inputSpecs.push_back(std::move(*inputSpecOrErr)); + InputSpec inputSpec = cantFail(parseInputFileSpec(libFileName)); + compileInfo.inputSpecs.push_back(std::move(inputSpec)); } } return Error::success(); diff --git a/llpc/tool/llpcShaderCache.h b/llpc/tool/llpcShaderCache.h index fa5ffd126f..dcbe3b6c26 100644 --- a/llpc/tool/llpcShaderCache.h +++ b/llpc/tool/llpcShaderCache.h @@ -173,7 +173,7 @@ class IShaderCache { IShaderCache() {} /// @internal Destructor. 
Prevent use of delete operator on this interface. - virtual ~IShaderCache() {} + virtual ~IShaderCache() = default; }; #endif diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.cpp b/llpc/translator/lib/SPIRV/SPIRVReader.cpp index e1b7cd8455..ac0b198805 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.cpp +++ b/llpc/translator/lib/SPIRV/SPIRVReader.cpp @@ -4902,6 +4902,8 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) { Type *const ptrType = transType(spvVar->getType()); unsigned addrSpace = ptrType->getPointerAddressSpace(); + auto llpcContext = static_cast(m_context); + auto buildInfo = static_cast(llpcContext->getPipelineBuildInfo()); Type *const varType = transType(spvVarType, 0, true, layout); @@ -4927,9 +4929,12 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) { } } if (!isBuiltIn) { - // Initializize user-defined output variable to zero + // Initialize user-defined output variable to zero initializer = Constant::getNullValue(varType); } + } else if (buildInfo->enableInitUndefZero && (storageClass == SPIRVStorageClassKind::StorageClassPrivate || + storageClass == SPIRVStorageClassKind::StorageClassFunction)) { + initializer = Constant::getNullValue(varType); } bool readOnly = false; @@ -5262,15 +5267,15 @@ lgc::CooperativeMatrixElementType SPIRVToLLVM::mapToBasicType(SPIRVType *const e lgc::CooperativeMatrixLayout SPIRVToLLVM::getLayout(lgc::CooperativeMatrixElementType elemType) { const Vkgc::GfxIpVersion gfxIp = getPipelineContext()->getGfxIpVersion(); - if (elemType == lgc::CooperativeMatrixElementType::Int32 || elemType == lgc::CooperativeMatrixElementType::Float32) { + + if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 32)) { if (gfxIp.major == 11) return lgc::CooperativeMatrixLayout::AccumulatorMatrixLayout; return lgc::CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout; } - if (elemType == lgc::CooperativeMatrixElementType::Int16 || elemType == lgc::CooperativeMatrixElementType::Int8 || - 
elemType == lgc::CooperativeMatrixElementType::Float16) { + if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 16) || BuilderCommon::isTypeNCooperativeMatrix(elemType, 8)) return lgc::CooperativeMatrixLayout::FactorMatrixLayout; - } + llvm_unreachable("The element type is not supported!"); return lgc::CooperativeMatrixLayout::InvalidLayout; } @@ -5292,7 +5297,7 @@ lgc::CooperativeMatrixLayout SPIRVToLLVM::getCooperativeMatrixKHRLayout(Cooperat if (use == CooperativeMatrixUse::CooperativeMatrixUseMatrixAccumulatorKHR) { if (gfxIp.major == 11) return lgc::CooperativeMatrixLayout::AccumulatorMatrixLayout; - if (elemType == lgc::CooperativeMatrixElementType::Float32 || elemType == lgc::CooperativeMatrixElementType::Int32) + if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 32)) return lgc::CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout; if (elemType == lgc::CooperativeMatrixElementType::Int16 || elemType == lgc::CooperativeMatrixElementType::Float16) return lgc::CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout; @@ -5526,9 +5531,10 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(static_cast(spvInst)->getMatrixBSigned()); bool isSat = static_cast(static_cast(spvInst)->getMatrixSatAccumulation()); - Value *coopMatrixD = getBuilder()->create(coopMatrixC->getType(), coopMatrixA, coopMatrixB, - coopMatrixC, isSignedA, isSignedB, isSat, 0, - elemBasicTypeC, elemBasicTypeA, "mulAdd"); + // Current SPIRV does not supported fp8 or bf8 yet, so the types of A and B use the same value. 
+ Value *coopMatrixD = getBuilder()->create( + coopMatrixC->getType(), coopMatrixA, coopMatrixB, coopMatrixC, isSignedA, isSignedB, isSat, 0, elemBasicTypeA, + elemBasicTypeA, elemBasicTypeC, "mulAdd"); return coopMatrixD; } diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEntry.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEntry.h index dbb07263b4..b673630d9c 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEntry.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEntry.h @@ -185,7 +185,7 @@ class SPIRVEntry { SPIRVEntry() : Module(NULL), OpCode(OpNop), Id(SPIRVID_INVALID), Attrib(SPIRVEA_DEFAULT), WordCount(0), Line(nullptr) {} - virtual ~SPIRVEntry() {} + virtual ~SPIRVEntry() = default; bool exist(SPIRVId) const; template T *get(SPIRVId TheId) const { return reinterpret_cast(getEntry(TheId)); } diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp index df8d706f6e..eeff4e8256 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp @@ -55,9 +55,6 @@ namespace SPIRV { SPIRVModule::SPIRVModule() : AutoAddCapability(true), ValidateCapability(false) { } -SPIRVModule::~SPIRVModule() { -} - class SPIRVModuleImpl : public SPIRVModule { public: SPIRVModuleImpl() diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h index f1757ee767..3f3a3c6835 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h @@ -88,7 +88,7 @@ class SPIRVModule { static SPIRVModule *createSPIRVModule(); SPIRVModule(); - virtual ~SPIRVModule(); + virtual ~SPIRVModule() = default; // Object query functions virtual bool exist(SPIRVId) const = 0; diff --git a/llvmraytracing/CMakeLists.txt b/llvmraytracing/CMakeLists.txt index cdc578e061..bc041f02bf 100644 --- a/llvmraytracing/CMakeLists.txt +++ b/llvmraytracing/CMakeLists.txt @@ -35,6 +35,7 @@ 
add_llvm_library(LLVMRaytracing lib/LowerRayQuery.cpp lib/LowerRaytracingPipeline.cpp lib/PassRegistry.inc + lib/PipelineState.cpp lib/PayloadAccessQualifiers.cpp lib/RemoveTypesMetadata.cpp @@ -43,6 +44,7 @@ add_llvm_library(LLVMRaytracing LINK_COMPONENTS Analysis + BinaryFormat Core Coroutines IPO @@ -107,8 +109,3 @@ if(LLVMRAYTRACING_BUILD_TESTS) add_custom_target(check-continuations DEPENDS check-llvmraytracing) add_custom_target(check-continuations-units DEPENDS check-llvmraytracing-units) endif() - -# Temporary alias -- to be removed when Vulkan CI and DXCP have been updated. -if (LLPC_RAYTRACING_ADD_TRANSITION_TARGETS) - add_library(LLVMContinuations ALIAS LLVMRaytracing) -endif() diff --git a/llvmraytracing/include/continuations/Continuations.h b/llvmraytracing/include/continuations/Continuations.h deleted file mode 100644 index 1e137767d8..0000000000 --- a/llvmraytracing/include/continuations/Continuations.h +++ /dev/null @@ -1,2 +0,0 @@ -// Transition header -- to be removed -#include "llvmraytracing/Continuations.h" diff --git a/llvmraytracing/include/continuations/ContinuationsUtil.h b/llvmraytracing/include/continuations/ContinuationsUtil.h deleted file mode 100644 index c346b5f5ae..0000000000 --- a/llvmraytracing/include/continuations/ContinuationsUtil.h +++ /dev/null @@ -1,2 +0,0 @@ -// Transition header -- to be removed -#include "llvmraytracing/ContinuationsUtil.h" diff --git a/llvmraytracing/include/llvmraytracing/Continuations.h b/llvmraytracing/include/llvmraytracing/Continuations.h index d4f3733991..3b94ad0c90 100644 --- a/llvmraytracing/include/llvmraytracing/Continuations.h +++ b/llvmraytracing/include/llvmraytracing/Continuations.h @@ -161,31 +161,6 @@ uint64_t computePayloadSpillSize(uint64_t NumI32s, uint64_t NumReservedRegisters // of individual bytes at the end if NumBytes is not a multiple of 4. 
void copyBytes(IRBuilder<> &B, Value *Dst, Value *Src, uint64_t NumBytes); -class DialectContextAnalysisResult { -public: - DialectContextAnalysisResult() {} - - bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &, llvm::ModuleAnalysisManager::Invalidator &) { - return false; - } -}; - -/// An analysis to run with dialects, even if the running tool does not have -/// explicit support for it. This will create a dialect context on-demand. -class DialectContextAnalysis : public llvm::AnalysisInfoMixin { -public: - using Result = DialectContextAnalysisResult; - DialectContextAnalysis(bool NeedDialectContext = true); - Result run(llvm::Module &module, llvm::ModuleAnalysisManager &); - static llvm::AnalysisKey Key; - -private: - std::unique_ptr Context; - // If true, this analysis is responsible to create a dialect context. - // If false, a context is already created outside of the pass pipeline. - bool NeedDialectContext; -}; - class LegacyCleanupContinuationsPass : public llvm::PassInfoMixin { public: LegacyCleanupContinuationsPass() {} @@ -203,37 +178,7 @@ class CleanupContinuationsPass : public llvm::PassInfoMixin Functions; - /// Size of the continuation state in byte - uint32_t ContStateBytes = 0; - CallInst *MallocCall = nullptr; - MDNode *MD = nullptr; - SmallVector NewFunctions; - }; - - void removeContFreeCall(Function *F, Function *ContFree); - Value *getContinuationFramePtr(Function *F, bool IsStart, const ContinuationData &ContinuationInfo, - SmallVector *InstsToRemove = nullptr); - void freeCpsStack(Function *F, ContinuationData &CpsInfo); - void updateCpsStack(Function *F, Function *NewFunc, bool IsStart, ContinuationData &CpsInfo); - void analyzeContinuation(Function &F, MDNode *MD); - void processContinuations(); - void handleContinue(ContinuationData &Data, Instruction *Ret); - void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun); - void lowerIntrinsicCall(Module &Mod); - void lowerGetResumePoint(Module &Mod); 
- - llvm_dialects::Builder *Builder = nullptr; - Function *ContMalloc = nullptr; - Function *ContFree = nullptr; - MapVector ToProcess; - uint32_t MaxContStateBytes; - llvm::Module *GpurtLibrary = nullptr; bool Use64BitContinuationReferences; - llvm::Type *ContinuationReferenceType = nullptr; }; // Define a wrapper pass that is used for CleanupContinuationsPass creating diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h index e05a875c9d..f134257833 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h +++ b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h @@ -125,6 +125,31 @@ struct ContSetting { uint64_t Value; }; +class DialectContextAnalysisResult { +public: + DialectContextAnalysisResult() {} + + bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &, llvm::ModuleAnalysisManager::Invalidator &) { + return false; + } +}; + +/// An analysis to run with dialects, even if the running tool does not have +/// explicit support for it. This will create a dialect context on-demand. +class DialectContextAnalysis : public llvm::AnalysisInfoMixin { +public: + using Result = DialectContextAnalysisResult; + DialectContextAnalysis(bool NeedDialectContext = true); + Result run(llvm::Module &Module, llvm::ModuleAnalysisManager &); + static llvm::AnalysisKey Key; + +private: + std::unique_ptr Context; + // If true, this analysis is responsible to create a dialect context. + // If false, a context is already created outside of the pass pipeline. + bool NeedDialectContext; +}; + // Helper class to access data specific to continuation passes, e.g. // metadata or globals. class ContHelper { @@ -162,32 +187,17 @@ class ContHelper { // for PAQed fields, and all other data required in a particular stage (e.g. // hit attributes). 
// - // [in] PreservedPayloadRegisterCount: - // The required number of preserved payload registers for functions that - // are not aware of payload types (e.g. Intersection or Traversal), if known. - // This gives an upper bound on the number of payload registers used by other - // functions together with functions in the current module. - // Setting this value can be used to reduce the number of preserved registers - // for such functions to prevent having to preserve the maximum possible - // amount of payload registers. This is used when compiling a specialized - // Traversal function for a pipeline after all shaders in the pipeline have - // been processed. - // For intersection, it is not used, because early-compiled intersection - // shaders can be used in pipelines with large payload types unknown when - // compiling the intersection shader. - static constexpr const char *MDPreservedPayloadRegisterCountName = "continuation.preservedPayloadRegisterCount"; // [in] MaxPayloadRegisterCount // The maximum allowed number of payload registers to be used for payload and // other inter-stage date (e.g. attributes). If state does not fit into this // limit, we spill to the continuation stack. static constexpr const char *MDMaxPayloadRegisterCountName = "continuation.maxPayloadRegisterCount"; - // [out] MaxUsedPayloadRegisterCount + // [in/out] MaxUsedPayloadRegisterCount // The maximum number of payload registers written or read by any - // shader in the module. This excludes intersection shaders, which + // shader in the pipeline. This excludes intersection shaders, which // just pass through an existing payload. - // This can be used to populate PreservedPayloadRegisterCount when compiling - // the driver module in case all modules of the pipeline are known and - // have already been processed. + // If this is set on a driver module, we rely on it being an upper bound on the + // number of payload registers that need to be preserved. 
static constexpr const char *MDMaxUsedPayloadRegisterCountName = "continuation.maxUsedPayloadRegisterCount"; // The address space used to store the continuations stack. // The possible values for this metadata are the values of ContStackAddrspace. @@ -348,7 +358,11 @@ class ContHelper { static std::optional tryGet##NAME(const Module &M) { return NAME::tryGetValue(&M); } \ static void set##NAME(Module &M, uint32_t Value) { NAME::setValue(&M, Value); } - MODULE_METADATA_HELPER(PreservedPayloadRegisterCount, MDPreservedPayloadRegisterCountName) + static std::optional tryGetPreservedPayloadRegisterCount(const Module &M) { + return tryGetMaxUsedPayloadRegisterCount(M); + } + static void setPreservedPayloadRegisterCount(Module &M, uint32_t Value) { setMaxUsedPayloadRegisterCount(M, Value); } + MODULE_METADATA_HELPER(MaxUsedPayloadRegisterCount, MDMaxUsedPayloadRegisterCountName) MODULE_METADATA_HELPER(MaxPayloadRegisterCount, MDMaxPayloadRegisterCountName) MODULE_METADATA_HELPER(Rtip, MDRtipName) @@ -356,11 +370,6 @@ class ContHelper { #undef MODULE_METADATA_HELPER - // Old alias until clients are migrated to setPreservedPayloadRegisterCount: - static void setMinPayloadRegisterCount(Module &M, uint32_t PreservedPayloadRegisterCount) { - PreservedPayloadRegisterCount::setValue(&M, PreservedPayloadRegisterCount); - } - // If there is module-level metadata specifying the stack addrspace, // return that value. Otherwise, return std::nullopt. 
static std::optional tryGetStackAddrspace(const Module &M) { diff --git a/llvmraytracing/include/llvmraytracing/LowerRayQuery.h b/llvmraytracing/include/llvmraytracing/LowerRayQuery.h index bbaba8793b..2b9d3ae344 100644 --- a/llvmraytracing/include/llvmraytracing/LowerRayQuery.h +++ b/llvmraytracing/include/llvmraytracing/LowerRayQuery.h @@ -46,6 +46,7 @@ class GpurtGetStaticFlagsOp; class GpurtStackReadOp; class GpurtStackWriteOp; class GpurtLdsStackInitOp; +class GpurtGetRayStaticIdOp; namespace rtq { class InitializeOp; @@ -151,6 +152,7 @@ class LowerRayQuery : public llvm::PassInfoMixin { void visitStackReadOp(lgc::GpurtStackReadOp &inst); void visitStackWriteOp(lgc::GpurtStackWriteOp &inst); void visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst); + void visitGetRayStaticIdOp(lgc::GpurtGetRayStaticIdOp &inst); void visitHitAccessor(GpurtFunc instType, llvm::Value *rayQuery, bool committed, llvm::CallBase *inst); void visitAccessor(GpurtFunc instType, llvm::Value *rayQuery, llvm::CallBase *inst); @@ -164,10 +166,12 @@ class LowerRayQuery : public llvm::PassInfoMixin { llvm::Module *m_gpurtModule = nullptr; llvm::Function **m_gpurtFuncs = nullptr; llvm::SmallVector m_rtqAlloc; + llvm::SmallVector m_callsToLower; llvm::SmallSet m_funcsToLower; llvm_dialects::Builder *m_builder = nullptr; CompilerUtils::TypeLowering *m_typeLowering = nullptr; llvm::Type *m_rtqType = nullptr; + unsigned m_traceRayId = 0; }; } // namespace rt diff --git a/llvmraytracing/include/llvmraytracing/PipelineState.h b/llvmraytracing/include/llvmraytracing/PipelineState.h new file mode 100644 index 0000000000..f00f335ac3 --- /dev/null +++ b/llvmraytracing/include/llvmraytracing/PipelineState.h @@ -0,0 +1,93 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file PipelineState.h + * @brief Declaration of pipeline state owned by llvmraytracing + * + * Some optimizations implemented in llvmraytracing depend on cross-module state. + * For instance, when compiling the Traversal shader, we need an upper bound on the payload size. + * + * This class keeps track of any such state that is owned my llvmraytracing, meaning it is produced + * and consumed by llvmraytracing passes, and it can be changed without pipeline compiler (e.g. LLPC) changes. 
+ * + * It supports importing/exporting from/to module metadata, merging with other pipeline states, and + * serialization/deserialization to binary blobs via MsgPack. + * + * It is intended to be used like this by pipeline compilers (such as LLPC): + * * After processing of an app module, its pipeline state is extracted from metadata, and merged with earlier state. + * * Before compiling a module with full pipeline knowledge (e.g. when compiling the Traversal shader), the merged + * state is exported to the module. + * * After having compiled a library/pipeline that might be reused by a child pipeline, its state is serialized. + * * When reusing an early-compiled parent library/pipeline, its state is deserialized and merged into the current + * pipeline's state. + * + * The pipeline compiler is not expected to collect and merge state of early-compiled driver modules (GpuRt), + * as these are compiled independently per pipeline, and thus compilation of child pipeline driver functions shouldn't + * depend on parent pipeline driver functions. + * + *********************************************************************************************************************** + */ +#pragma once + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +namespace llvm { +class Module; +namespace msgpack { +class DocNode; +} // namespace msgpack +} // namespace llvm + +namespace llvmraytracing { + +class PipelineState { +public: + // Construct a new trivial pipeline state which can be used to be merged with non-trivial state later. + PipelineState() = default; + + // (De)serialization to/from MsgPack is both supported standalone, or as part of an outer MsgPack document. 
+ static llvm::Expected decodeMsgpack(llvm::StringRef Data); + // Node is non-const because the const-correct accessors are less convenient to work with + static llvm::Expected decodeMsgpack(llvm::msgpack::DocNode &Node); + std::string encodeMsgpack() const; + void encodeMsgpack(llvm::msgpack::DocNode &Node) const; + + static llvm::Expected fromModuleMetadata(const llvm::Module &M); + void exportModuleMetadata(llvm::Module &M) const; + + void merge(const PipelineState &Other); + +private: + // Actual state is intentionally private, as this interface is intended to be used like opaque state. + // llvmraytracing passes don't use this interface, and instead directly work on module metadata. + + // The maximum occurring number of payload registers in the pipeline, which will be taken into account for Traversal + // module so that it sees the correct maximum payload size of a pipeline. + unsigned MaxUsedPayloadRegisterCount = 0; +}; + +} // namespace llvmraytracing diff --git a/llvmraytracing/lib/CleanupContinuations.cpp b/llvmraytracing/lib/CleanupContinuations.cpp index 93e1f8f66b..0a6639b278 100644 --- a/llvmraytracing/lib/CleanupContinuations.cpp +++ b/llvmraytracing/lib/CleanupContinuations.cpp @@ -75,6 +75,52 @@ using namespace lgc; #define DEBUG_TYPE "cleanup-continuations" +namespace { + +class CleanupContinuationsPassImpl { +public: + CleanupContinuationsPassImpl(llvm::Module &M, llvm::ModuleAnalysisManager &AM, + bool Use64BitContinuationReferences = false); + + PreservedAnalyses run(); + +private: + struct ContinuationData { + /// All functions belonging to this continuation, the entry function is the + /// first one + SmallVector Functions; + /// Size of the continuation state in byte + uint32_t ContStateBytes = 0; + CallInst *MallocCall = nullptr; + MDNode *MD = nullptr; + SmallVector NewFunctions; + }; + + void removeContFreeCall(Function *F, Function *ContFree); + Value *getContinuationFramePtr(Function *F, bool IsStart, const ContinuationData 
&ContinuationInfo, + SmallVector *InstsToRemove = nullptr); + void freeCpsStack(Function *F, ContinuationData &CpsInfo); + void updateCpsStack(Function *F, Function *NewFunc, bool IsStart, ContinuationData &CpsInfo); + void analyzeContinuation(Function &F, MDNode *MD); + void processContinuations(); + void handleContinue(ContinuationData &Data, Instruction *Ret); + void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun); + void lowerIntrinsicCall(Module &Mod); + void lowerGetResumePoint(Module &Mod); + bool lowerCompleteOp(Module &Mod); + + llvm::Module &Mod; + llvm::ModuleAnalysisManager &AnalysisManager; + llvm_dialects::Builder *Builder = nullptr; + Function *ContMalloc = nullptr; + Function *ContFree = nullptr; + MapVector ToProcess; + uint32_t MaxContStateBytes; + llvm::Module *GpurtLibrary = nullptr; + bool Use64BitContinuationReferences; + llvm::Type *ContinuationReferenceType = nullptr; +}; + /// Find the original call that created the continuation token and the matching /// resume function for a return value. 
/// @@ -139,7 +185,7 @@ findTokenOrigin(BasicBlock *BB, Value *V, SmallVectorImpl &ToRemo return Result; } -void CleanupContinuationsPass::analyzeContinuation(Function &F, MDNode *MD) { +void CleanupContinuationsPassImpl::analyzeContinuation(Function &F, MDNode *MD) { // Only analyze main continuation auto *MDTup = cast(MD); auto *EntryF = mdconst::extract(MDTup->getOperand(0)); @@ -173,7 +219,8 @@ void CleanupContinuationsPass::analyzeContinuation(Function &F, MDNode *MD) { MaxContStateBytes = Data.ContStateBytes; } -void CleanupContinuationsPass::updateCpsStack(Function *F, Function *NewFunc, bool IsStart, ContinuationData &CpsInfo) { +void CleanupContinuationsPassImpl::updateCpsStack(Function *F, Function *NewFunc, bool IsStart, + ContinuationData &CpsInfo) { Builder->SetInsertPoint(&*NewFunc->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); Value *CpsStack = nullptr; @@ -247,9 +294,9 @@ static void buildCpsArgInfos(Function *F, bool IsStart, SmallVector &All /// Find the continuation state pointer, either returned by the malloc or /// given as an argument -Value *CleanupContinuationsPass::getContinuationFramePtr(Function *F, bool IsStart, - const ContinuationData &ContinuationInfo, - SmallVector *InstsToRemove) { +Value *CleanupContinuationsPassImpl::getContinuationFramePtr(Function *F, bool IsStart, + const ContinuationData &ContinuationInfo, + SmallVector *InstsToRemove) { if (!ContinuationInfo.MallocCall) return IsStart ? F->getArg(F->arg_size() - 1) : F->getArg(0); @@ -267,7 +314,7 @@ Value *CleanupContinuationsPass::getContinuationFramePtr(Function *F, bool IsSta /// Remove call to continuation.free() in F, ContFree is the pointer to /// declaration of continuation.free(). 
-void CleanupContinuationsPass::removeContFreeCall(Function *F, Function *ContFree) { +void CleanupContinuationsPassImpl::removeContFreeCall(Function *F, Function *ContFree) { for (auto *User : make_early_inc_range(ContFree->users())) { if (auto *Call = dyn_cast(User)) { if (Call->getFunction() == F) { @@ -279,7 +326,7 @@ void CleanupContinuationsPass::removeContFreeCall(Function *F, Function *ContFre } /// Insert cps.free() before the original function exits and lgc.cps.complete calls. -void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsInfo) { +void CleanupContinuationsPassImpl::freeCpsStack(Function *F, ContinuationData &CpsInfo) { struct VisitState { ContinuationData &CpsInfo; llvm_dialects::Builder *Builder; @@ -288,7 +335,6 @@ void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsIn VisitState State = {CpsInfo, Builder, F}; static const auto Visitor = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .addSet([](auto &State, auto &Instruction) { if (Instruction.getFunction() == State.F && State.CpsInfo.ContStateBytes) { State.Builder->SetInsertPoint(&Instruction); @@ -299,7 +345,31 @@ void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsIn Visitor.visit(State, *F); } -void CleanupContinuationsPass::processContinuations() { +/// Handle lgc.cps.complete calls. 
+bool CleanupContinuationsPassImpl::lowerCompleteOp(Module &Mod) { + struct VisitState { + llvm_dialects::Builder *Builder; + bool completeLowered; + }; + + bool completeLowered = false; + VisitState State = {Builder, completeLowered}; + static auto Visitor = llvm_dialects::VisitorBuilder() + .add([](VisitState &State, auto &complete) { + State.Builder->SetInsertPoint(&complete); + State.Builder->CreateRetVoid(); + BasicBlock *block = complete.getParent(); + block->getTerminator()->eraseFromParent(); + complete.eraseFromParent(); + State.completeLowered = true; + }) + .build(); + + Visitor.visit(State, Mod); + return State.completeLowered; +} + +void CleanupContinuationsPassImpl::processContinuations() { // Summarize of what to do here: // 1. Continuation Stack // a.) cps.alloc() in start, and cps.peek() cps.free() in resume. @@ -362,17 +432,6 @@ void CleanupContinuationsPass::processContinuations() { auto *I = BB.getTerminator(); if (isa(I)) { handleContinue(FuncData.second, I); - } else if (I->getOpcode() == Instruction::Unreachable) { - // We should only have 'lgc.cps.complete' or 'lgc.cps.jump' calls before unreachable. - auto *Call = cast(--I->getIterator()); - if (isa(Call)) { - Builder->SetInsertPoint(Call); - Builder->CreateRetVoid(); - Call->eraseFromParent(); - I->eraseFromParent(); - } else { - assert(isa(Call)); - } } } @@ -413,7 +472,7 @@ void CleanupContinuationsPass::processContinuations() { /// i32 %cr2, ...) /// /// Also handles cases where the token and resume function are behind a phi. 
-void CleanupContinuationsPass::handleContinue(ContinuationData &Data, Instruction *Ret) { +void CleanupContinuationsPassImpl::handleContinue(ContinuationData &Data, Instruction *Ret) { // Find the function call that generates the token LLVM_DEBUG(dbgs() << "Converting ret to continue: " << *Ret << "\nArgument: " << *Ret->getOperand(0) << "\n"); auto *BB = Ret->getParent(); @@ -438,7 +497,7 @@ void CleanupContinuationsPass::handleContinue(ContinuationData &Data, Instructio } } -void CleanupContinuationsPass::handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun) { +void CleanupContinuationsPassImpl::handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun) { Builder->SetInsertPoint(Call); SmallVector TailArgs; @@ -478,7 +537,7 @@ void CleanupContinuationsPass::handleSingleContinue(ContinuationData &Data, Call } /// Lower lgc.rt calls inside cps functions. -void CleanupContinuationsPass::lowerIntrinsicCall(Module &Mod) { +void CleanupContinuationsPassImpl::lowerIntrinsicCall(Module &Mod) { DenseMap> CpsIntrinsicCalls; // We only care about lgc.rt here. 
@@ -520,7 +579,7 @@ void CleanupContinuationsPass::lowerIntrinsicCall(Module &Mod) { } } -void CleanupContinuationsPass::lowerGetResumePoint(Module &Mod) { +void CleanupContinuationsPassImpl::lowerGetResumePoint(Module &Mod) { for (auto &F : make_early_inc_range(Mod)) { auto FuncName = F.getName(); if (!FuncName.starts_with("_AmdGetResumePointAddr")) @@ -544,7 +603,12 @@ void CleanupContinuationsPass::lowerGetResumePoint(Module &Mod) { } } -llvm::PreservedAnalyses CleanupContinuationsPass::run(llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager) { +CleanupContinuationsPassImpl::CleanupContinuationsPassImpl(llvm::Module &M, llvm::ModuleAnalysisManager &AM, + bool Use64BitContinuationReferences) + : Mod(M), AnalysisManager(AM), Use64BitContinuationReferences{Use64BitContinuationReferences} { +} + +llvm::PreservedAnalyses CleanupContinuationsPassImpl::run() { LLVM_DEBUG(dbgs() << "Run the lgc-cleanup-continuations pass\n"); AnalysisManager.getResult(Mod); auto &FAM = AnalysisManager.getResult(Mod).getManager(); @@ -614,13 +678,26 @@ llvm::PreservedAnalyses CleanupContinuationsPass::run(llvm::Module &Mod, llvm::M } } + bool Changed = false; if (!ToProcess.empty()) { processContinuations(); // Lower lgc.rt intrinsics lowerIntrinsicCall(Mod); lowerGetResumePoint(Mod); - return PreservedAnalyses::none(); + Changed = true; } - return PreservedAnalyses::all(); + + Changed |= lowerCompleteOp(Mod); + + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +} // namespace + +llvm::PreservedAnalyses CleanupContinuationsPass::run(llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager) { + LLVM_DEBUG(dbgs() << "Run the cleanup-continuations pass\n"); + AnalysisManager.getResult(Mod); + CleanupContinuationsPassImpl Impl(Mod, AnalysisManager, Use64BitContinuationReferences); + return Impl.run(); } diff --git a/llvmraytracing/lib/Continuations.cpp b/llvmraytracing/lib/Continuations.cpp index 29c616181d..a9bb95cc3a 100644 --- a/llvmraytracing/lib/Continuations.cpp +++ b/llvmraytracing/lib/Continuations.cpp @@ -832,6 +832,12 @@ CallInst *llvm::replaceIntrinsicCall(IRBuilder<> &B, Type *SystemDataTy, Value * } } + // Tolerate Replacement returning a single-element struct containing a value of the right type. + if (!Call->getType()->isVoidTy() && Call->getType() != Replacement->getType()) { + assert(cast(Replacement->getType())->getNumElements() == 1); + Replacement = B.CreateExtractValue(Replacement, 0); + } + LLVM_DEBUG(dbgs() << "Replacing " << *Call << " by " << *NewCall << "\n"); if (!Call->getType()->isVoidTy()) Call->replaceAllUsesWith(Replacement); diff --git a/llvmraytracing/lib/LegacyCleanupContinuations.cpp b/llvmraytracing/lib/LegacyCleanupContinuations.cpp index db36d52c62..6a6c2b921b 100644 --- a/llvmraytracing/lib/LegacyCleanupContinuations.cpp +++ b/llvmraytracing/lib/LegacyCleanupContinuations.cpp @@ -41,6 +41,7 @@ #include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "llvm-dialects/Dialect/Builder.h" +#include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Instructions.h" @@ -88,6 +89,7 @@ class LegacyCleanupContinuationsPassImpl { void handleFunctionEntry(ContinuationData &Data, Function *F, bool IsEntry); void handleContinue(ContinuationData &Data, Instruction *Ret); void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun); + 
bool lowerCompleteOp(Module &M); Module &M; LLVMContext &Context; @@ -295,6 +297,26 @@ Value *getContFrame(CallInst *MallocCall, Function *F, bool IsStart, SmallVector return ContFrame; } +bool LegacyCleanupContinuationsPassImpl::lowerCompleteOp(Module &M) { + struct VisitState { + llvm_dialects::Builder &Builder; + bool completeLowered; + }; + + bool completeLowered = false; + VisitState State = {B, completeLowered}; + static auto Visitor = llvm_dialects::VisitorBuilder() + .add([](VisitState &State, auto &complete) { + State.Builder.SetInsertPoint(&complete); + llvm::terminateShader(State.Builder, &complete); + State.completeLowered = true; + }) + .build(); + + Visitor.visit(State, M); + return State.completeLowered; +} + void LegacyCleanupContinuationsPassImpl::processContinuation(Function *StartFunc, ContinuationData &FuncData) { auto *Void = Type::getVoidTy(Context); LLVM_DEBUG(dbgs() << "Processing function: " << StartFunc->getName() << "\n"); @@ -471,8 +493,6 @@ void LegacyCleanupContinuationsPassImpl::processContinuation(Function *StartFunc uint32_t NeededStackSize = FuncData.getContStateStackBytes(); if (NeededStackSize > 0) B.create(B.getInt32(NeededStackSize)); - - llvm::terminateShader(B, PrevInst); } else { LLVM_DEBUG(PrevInst->dump()); llvm_unreachable("Unexpected instruction!"); @@ -670,6 +690,8 @@ PreservedAnalyses LegacyCleanupContinuationsPassImpl::run() { fixupDxilMetadata(M); } + Changed |= lowerCompleteOp(M); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvmraytracing/lib/LgcCpsJumpInliner.cpp b/llvmraytracing/lib/LgcCpsJumpInliner.cpp index e970060385..3890117b02 100644 --- a/llvmraytracing/lib/LgcCpsJumpInliner.cpp +++ b/llvmraytracing/lib/LgcCpsJumpInliner.cpp @@ -69,16 +69,16 @@ LgcCpsJumpInlinerPassImpl::LgcCpsJumpInlinerPassImpl(Module &M, Module &GpurtLib PreservedAnalyses LgcCpsJumpInlinerPassImpl::run() { using JumpVecTy = SmallVector; - static const auto Visitor = - llvm_dialects::VisitorBuilder>() - .add([](SmallVector &AllJumps, JumpOp &Jump) { AllJumps.push_back(&Jump); }) - .build(); + static const auto Visitor = llvm_dialects::VisitorBuilder>() + .add([](JumpVecTy &AllJumps, JumpOp &Jump) { AllJumps.push_back(&Jump); }) + .build(); JumpVecTy AllJumps; // Collect lgc.cps.jump ops. Visitor.visit(AllJumps, *Mod); bool Changed = false; + DenseSet DeadFunctions; // Iterate over all collected jumps and try to inline the jump target. for (auto *Jump : AllJumps) { auto *AsCROp = dyn_cast(Jump->getTarget()); @@ -113,12 +113,16 @@ PreservedAnalyses LgcCpsJumpInlinerPassImpl::run() { AsCROp->eraseFromParent(); // There might still be other users left, if the function is not referenced as direct jump target. + // Remove function after this loop, it may contain jumps that we still want to inline. if (JumpTargetFunc->user_empty() && JumpTargetFunc->getLinkage() == GlobalValue::InternalLinkage) - JumpTargetFunc->eraseFromParent(); + DeadFunctions.insert(JumpTargetFunc); Changed = true; } + for (auto *F : DeadFunctions) + F->eraseFromParent(); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvmraytracing/lib/LowerRayQuery.cpp b/llvmraytracing/lib/LowerRayQuery.cpp index 9df3b4b6f7..f94e3cc0df 100644 --- a/llvmraytracing/lib/LowerRayQuery.cpp +++ b/llvmraytracing/lib/LowerRayQuery.cpp @@ -53,24 +53,24 @@ static const char *const GpurtFuncNames[] = { "_RayQuery_CommitNonOpaqueTriangleHit", "_RayQuery_CommitProceduralPrimitiveHit", "_RayQuery_EndInterleavedProceed", - "FetchTrianglePositionFromRayQuery", + "_RayQuery_FetchTrianglePosition", "_RayQuery_GeometryIndex", "_RayQuery_GetObjId", "_RayQuery_InstanceContributionToHitGroupIndex", "_RayQuery_InstanceID", "_RayQuery_InstanceIndex", "_RayQuery_IntersectionType", - "LongRayQueryProceedAMD", + "_RayQuery_LongProceedAMD", "_RayQuery_ObjectRayDirection", "_RayQuery_ObjectRayOrigin", "_RayQuery_ObjectToWorld4x3", "_RayQuery_PrimitiveIndex", "_RayQuery_RayFlags", - "RayQueryProceed", + "_RayQuery_Proceed", "_RayQuery_RayT", "_RayQuery_RayTMin", "_RayQuery_SetObjId", - "TraceRayInline", + "_RayQuery_TraceRayInline", "_RayQuery_TriangleBarycentrics", "_RayQuery_TriangleFrontFace", "_RayQuery_WorldRayDirection", @@ -604,6 +604,17 @@ void LowerRayQuery::visitLdsStackInitOp(GpurtLdsStackInitOp &inst) { inst.setUseExtraStack(true); } +// ===================================================================================================================== +// Visits "lgc.gpurt.get.ray.static.id" instructions +// +// @param inst : The instruction +void LowerRayQuery::visitGetRayStaticIdOp(GpurtGetRayStaticIdOp &inst) { + auto hashcode = hash_combine(m_traceRayId++, inst.getFunction()->getName()); + inst.replaceAllUsesWith(m_builder->getInt32(hashcode)); + m_callsToLower.push_back(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + // ===================================================================================================================== // Executes this LowerRayquery pass on the specified LLVM module. 
// @@ -622,7 +633,6 @@ PreservedAnalyses LowerRayQuery::run(Module &module, ModuleAnalysisManager &anal static auto findRayqueryDialect = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add([](FuncSet &funcSet, auto &inst) { funcSet.insert(inst.getFunction()); }) .build(); findRayqueryDialect.visit(rayQueryFuncs, module); @@ -679,21 +689,29 @@ PreservedAnalyses LowerRayQuery::run(Module &module, ModuleAnalysisManager &anal payload.typeLower.finishPhis(); payload.typeLower.finishCleanup(); - static auto postVisit = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add(&LowerRayQuery::visitGetStaticFlagsOp) .add(&LowerRayQuery::visitStackReadOp) .add(&LowerRayQuery::visitStackWriteOp) .add(&LowerRayQuery::visitLdsStackInitOp) + .add(&LowerRayQuery::visitGetRayStaticIdOp) .build(); postVisit.visit(*this, module); m_typeLowering = nullptr; + + for (Instruction *call : m_callsToLower) { + call->dropAllReferences(); + call->eraseFromParent(); + } + m_callsToLower.clear(); + for (Function *func : m_funcsToLower) { func->dropAllReferences(); func->eraseFromParent(); } + m_funcsToLower.clear(); + return PreservedAnalyses::none(); } diff --git a/llvmraytracing/lib/LowerRaytracingPipeline.cpp b/llvmraytracing/lib/LowerRaytracingPipeline.cpp index f5d0cded8c..ac041a24ca 100644 --- a/llvmraytracing/lib/LowerRaytracingPipeline.cpp +++ b/llvmraytracing/lib/LowerRaytracingPipeline.cpp @@ -228,14 +228,18 @@ class ModuleMetadataState final { uint32_t getMaxPayloadRegisterCount() const { return MaxPayloadRegisterCount; } - std::optional tryGetPreservedPayloadRegisterCount() const { return PreservedPayloadRegisterCount; } - void updateMaxUsedPayloadRegisterCount(uint32_t Count) { MaxUsedPayloadRegisterCount = std::max(Count, MaxUsedPayloadRegisterCount); } uint32_t getMaxUsedPayloadRegisterCount() const { return MaxUsedPayloadRegisterCount; } + // Returns whether a value for 
maxUsedPayloadRegisterCount was set in the input module. + // If that is the case, for driver functions we rely on it. + // This mechanism ensures we don't rely on it in case the value was only initialized + // during processing of the current module. + bool maxUsedPayloadRegisterCountWasSet() const { return MaxUsedPayloadRegisterCountWasSet; } + uint32_t getMaxHitAttributeByteCount() const { return MaxHitAttributeByteCount; } bool isInLgcCpsMode() const { return IsInLgcCpsMode; } @@ -250,12 +254,13 @@ class ModuleMetadataState final { /// [In]: Maximum allowed number of registers to be used for the payload. /// It is guaranteed that all modules in a pipeline share this value. uint32_t MaxPayloadRegisterCount = 0; - /// [In]: If known, the number of payload registers that need to be preserved - /// by functions that don't know the payload type, e.g. Traversal. - std::optional PreservedPayloadRegisterCount = {}; - /// [Out]: The maximum number of payload registers written or read by any - /// shader in the module. This excludes intersection shaders, which - /// just pass through an existing payload. + /// [In/Out]: The maximum number of payload registers written or read by any + /// shader in the pipeline observed so far. + /// This excludes intersection shaders, which just pass through an existing payload. + /// If set on an incoming module, we can rely on it being an upper bound + /// for driver functions, because driver functions are compiled last and not + /// reused for child pipelines. + /// We can't rely on it when compiling app shaders (e.g. intersection). uint32_t MaxUsedPayloadRegisterCount = 0; /// [In]: The maximum size of hit attribute stored on the module as metadata. uint32_t MaxHitAttributeByteCount = 0; @@ -265,6 +270,8 @@ class ModuleMetadataState final { /// If the module has lgc.cps.module metadata attached. 
bool IsInLgcCpsMode = false; + + bool MaxUsedPayloadRegisterCountWasSet = false; }; class LowerRaytracingPipelinePassImpl final { @@ -611,14 +618,9 @@ ModuleMetadataState::ModuleMetadataState(Module &Module) : Mod{Module} { auto RegisterCountFromMD = ContHelper::MaxPayloadRegisterCount::tryGetValue(&Module); MaxPayloadRegisterCount = RegisterCountFromMD.value_or(DefaultPayloadRegisterCount); - // Check that if there is a required minimum number of payload registers, - // it is compatible - PreservedPayloadRegisterCount = ContHelper::PreservedPayloadRegisterCount::tryGetValue(&Module); - assert(PreservedPayloadRegisterCount.value_or(MaxPayloadRegisterCount) <= MaxPayloadRegisterCount); - - MaxUsedPayloadRegisterCount = ContHelper::MaxUsedPayloadRegisterCount::tryGetValue(&Module).value_or(0); - if (PreservedPayloadRegisterCount.has_value()) - MaxUsedPayloadRegisterCount = std::max(MaxUsedPayloadRegisterCount, PreservedPayloadRegisterCount.value()); + auto OptMaxUsedPayloadRegisterCount = ContHelper::MaxUsedPayloadRegisterCount::tryGetValue(&Module); + MaxUsedPayloadRegisterCount = OptMaxUsedPayloadRegisterCount.value_or(0); + MaxUsedPayloadRegisterCountWasSet = OptMaxUsedPayloadRegisterCount.has_value(); // Use max hit attribute size from metadata, or use globally max allowed // value for the max if metadata is not set @@ -961,6 +963,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy RetAddr = PoisonValue::get(Builder.getInt64Ty()); } else { RetAddr = Call->getArgOperand(RetAddrArgIndex); + assert(RetAddr->getType()->isIntegerTy(32) || RetAddr->getType()->isIntegerTy(64)); ++RetAddrArgIndex; } @@ -1430,9 +1433,9 @@ void LowerRaytracingPipelinePassImpl::setGpurtEntryRegisterCountMetadata() { // Even if PreservedPayloadRegisterCount is set, there may be // additional shaders in the current module whose usage is recorded // in MaxUsedPayloadRegisterCount, to take the max with it. 
- uint32_t MaxRegisterCount = - std::max(MetadataState.tryGetPreservedPayloadRegisterCount().value_or(MetadataState.getMaxPayloadRegisterCount()), - MetadataState.getMaxUsedPayloadRegisterCount()); + uint32_t MaxRegisterCount = MetadataState.maxUsedPayloadRegisterCountWasSet() + ? MetadataState.getMaxUsedPayloadRegisterCount() + : MetadataState.getMaxPayloadRegisterCount(); struct VisitorState { ModuleMetadataState &Metadata; @@ -1759,10 +1762,9 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData AllArgTypes.push_back(SystemDataTy); NewRetTy = SystemDataTy; - // We should have set up preserved register count for Traversal, if not, - // fall back to max count. - Data.NumPassedThroughPayloadDwords = - MetadataState.tryGetPreservedPayloadRegisterCount().value_or(MetadataState.getMaxPayloadRegisterCount()); + Data.NumPassedThroughPayloadDwords = MetadataState.maxUsedPayloadRegisterCountWasSet() + ? MetadataState.getMaxUsedPayloadRegisterCount() + : MetadataState.getMaxPayloadRegisterCount(); break; } default: @@ -2385,7 +2387,6 @@ PreservedAnalyses LowerRaytracingPipelinePassImpl::run() { static const auto Visitor = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByInstruction) .addSet( [](VisitorState &State, Instruction &Op) { auto *CInst = cast(&Op); diff --git a/llvmraytracing/lib/PipelineState.cpp b/llvmraytracing/lib/PipelineState.cpp new file mode 100644 index 0000000000..ffbda3b41d --- /dev/null +++ b/llvmraytracing/lib/PipelineState.cpp @@ -0,0 +1,116 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file PipelineState.cpp + * @brief Implementation of helpers for llvmraytracing pipeline state. 
+ *********************************************************************************************************************** + */ + +#include "llvmraytracing/PipelineState.h" +#include "llvmraytracing/ContinuationsUtil.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" + +using namespace llvm; + +namespace { +// Constants used in the msgpack format +namespace MsgPackFormat { + +constexpr unsigned MajorVersion = 1; + +static constexpr char Version[] = "version"; +static constexpr char MaxUsedPayloadRegisterCount[] = "max_used_payload_register_count"; + +} // namespace MsgPackFormat +} // anonymous namespace + +namespace llvmraytracing { + +Expected PipelineState::decodeMsgpack(llvm::msgpack::DocNode &Root) { + auto &Node = Root.getMap(); + + auto GetUInt = [](msgpack::DocNode &Node, auto &Out) { + if (!Node.isEmpty()) + Out = Node.getUInt(); + }; + + uint64_t Version = 0; + GetUInt(Node[MsgPackFormat::Version], Version); + if (Version != MsgPackFormat::MajorVersion) + return make_error("bad/missing llvmraytracing pipelinestate version", inconvertibleErrorCode()); + + PipelineState State = {}; + GetUInt(Node[MsgPackFormat::MaxUsedPayloadRegisterCount], State.MaxUsedPayloadRegisterCount); + + return State; +} + +Expected PipelineState::decodeMsgpack(StringRef Data) { + msgpack::Document Doc; + + if (!Doc.readFromBlob(Data, false)) + return make_error("failed to parse msgpack", inconvertibleErrorCode()); + + auto &Root = Doc.getRoot().getMap(); + return decodeMsgpack(Root); +} + +void PipelineState::encodeMsgpack(llvm::msgpack::DocNode &Root) const { + auto &Node = Root.getMap(true); + Node[MsgPackFormat::Version] = MsgPackFormat::MajorVersion; + Node[MsgPackFormat::MaxUsedPayloadRegisterCount] = MaxUsedPayloadRegisterCount; +} + +std::string PipelineState::encodeMsgpack() const { + msgpack::Document Doc; + + auto &Root = Doc.getRoot().getMap(true); + encodeMsgpack(Root); + + std::string Out; + Doc.writeToBlob(Out); + return Out; +} + +llvm::Expected 
PipelineState::fromModuleMetadata(const llvm::Module &M) { + PipelineState State = {}; + auto OptMaxUsedPayloadRegCount = ContHelper::tryGetMaxUsedPayloadRegisterCount(M); + if (OptMaxUsedPayloadRegCount.has_value()) + State.MaxUsedPayloadRegisterCount = *OptMaxUsedPayloadRegCount; + return State; +} + +void PipelineState::exportModuleMetadata(llvm::Module &M) const { + if (MaxUsedPayloadRegisterCount) { + ContHelper::setMaxUsedPayloadRegisterCount(M, MaxUsedPayloadRegisterCount); + } +} + +void PipelineState::merge(const PipelineState &Other) { + MaxUsedPayloadRegisterCount = std::max(MaxUsedPayloadRegisterCount, Other.MaxUsedPayloadRegisterCount); +} + +} // namespace llvmraytracing diff --git a/llvmraytracing/test/dx/continuation-registercount.ll b/llvmraytracing/test/dx/continuation-registercount.ll index 5d0bdefd4e..1e7418dd21 100644 --- a/llvmraytracing/test/dx/continuation-registercount.ll +++ b/llvmraytracing/test/dx/continuation-registercount.ll @@ -39,10 +39,10 @@ declare i32 @_cont_GetContinuationStackAddr() #0 declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) #0 ; Function Attrs: alwaysinline -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 +declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, i64, %struct.AnyHitTraversalData) #0 ; Function Attrs: nounwind memory(read) declare !pointeetys !24 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %struct.HitData*) #1 @@ -102,7 +102,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i ; Function Attrs: alwaysinline define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !37 { %dis_data = load %struct.DispatchSystemData, 
%struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -111,7 +111,7 @@ define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !poi ; Function Attrs: alwaysinline define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !38 { %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) + %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.AnyHitTraversalData %trav_data) store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 call void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) ret i1 true @@ -155,12 +155,16 @@ define void @called(%struct.MyParams* %arg) !pointeetys !39 { } ; MAX10-DAG: Incoming payload VGPR size of "Intersection" (intersection): 10 dwords -; MAX10-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 10 dwords +; MAX30-DAG: Incoming payload VGPR size of "Intersection" (intersection): 30 dwords ; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX10-DAG: call void (...) @lgc.cps.jump(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}: 10 dwords +; MAX10-DAG: call void (...) @lgc.cps.jump(i64 3, {{.*}}: 10 dwords +; MAX30-DAG: call void (...) 
@lgc.cps.jump(i64 3, {{.*}}: 30 dwords + +; MAX10-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 10 dwords ; MAX30-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 30 dwords ; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX30-DAG: call void (...) @lgc.cps.jump(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}: 30 dwords +; MAX10-DAG: call void (...) @lgc.cps.jump(i64 %returnAddr.reload{{.*}}: 10 dwords +; MAX30-DAG: call void (...) @lgc.cps.jump(i64 %returnAddr.reload{{.*}}: 30 dwords define void @Intersection() #3 { %a = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 @@ -231,8 +235,8 @@ attributes #3 = { nounwind } !dx.entryPoints = !{!3, !6, !13, !15, !17, !19, !21, !57} !continuation.maxPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 !continuation.maxPayloadRegisterCount = !{!53} ; 30; only for MAX_REG_30 -!continuation.preservedPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 -!continuation.preservedPayloadRegisterCount = !{!54} ; 27; only for MAX_REG_30 +!continuation.maxUsedPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 +!continuation.maxUsedPayloadRegisterCount = !{!54} ; 27; only for MAX_REG_30 !lgc.rt.max.attribute.size = !{!60} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} diff --git a/llvmraytracing/test/dx/continuation-stacksize.ll b/llvmraytracing/test/dx/continuation-stacksize.ll index 590090f7f2..37861b2f0a 100644 --- a/llvmraytracing/test/dx/continuation-stacksize.ll +++ b/llvmraytracing/test/dx/continuation-stacksize.ll @@ -31,7 +31,7 @@ declare !pointeetys !33 i1 @_cont_ReportHit(%struct.TraversalData* %data, float declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, 
%struct.DispatchSystemData) #0 ; Function Attrs: alwaysinline declare !pointeetys !17 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 @@ -61,7 +61,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i ; Function Attrs: alwaysinline define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !23 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -147,6 +147,7 @@ attributes #3 = { nounwind memory(read) } !dx.valver = !{!1} !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6, !13, !15} +!lgc.rt.max.attribute.size = !{!34} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -182,3 +183,4 @@ attributes #3 = { nounwind memory(read) } !31 = !{i32 0, %struct.TheirParams2 poison} !32 = !{i32 0, %struct.TraversalData poison} !33 = !{%struct.TraversalData poison} +!34 = !{i32 8} diff --git a/llvmraytracing/test/dx/continuation-without-await.ll b/llvmraytracing/test/dx/continuation-without-await.ll index d0f0d6155b..b26552c00e 100644 --- a/llvmraytracing/test/dx/continuation-without-await.ll +++ b/llvmraytracing/test/dx/continuation-without-await.ll @@ -26,7 +26,7 @@ declare i32 @_cont_GetContinuationStackAddr() declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) declare !pointeetys !16 
%struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) @@ -53,7 +53,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !pointeetys !22 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -94,6 +94,7 @@ attributes #2 = { nounwind } !dx.valver = !{!1} !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6, !13, !14} +!lgc.rt.max.attribute.size = !{!29} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -124,13 +125,14 @@ attributes #2 = { nounwind } !26 = !{i32 0, %struct.TheirParams poison} !27 = !{i32 0, %struct.TraversalData poison} !28 = !{%struct.TraversalData poison} +!29 = !{i32 8} ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( ; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) { ; LOWERRAYTRACINGPIPELINE-NEXT: ret i32 5 ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @main( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META21:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[PARAMS:%.*]] = alloca 
[[STRUCT_THEIRPARAMS:%.*]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 @@ -142,15 +144,15 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP3]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [9 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount [[META21]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } @await(ptr [[TMP5]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP6]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [2 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META22:![0-9]+]], !continuation.returnedRegistercount [[META22]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } @await(ptr [[TMP5]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP6]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP11]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load 
i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP6]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP6]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP18]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] @@ -160,7 +162,7 @@ attributes #2 = { nounwind } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @main_no_call( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META20]] !continuation.registercount [[META8]] !continuation [[META23:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -170,7 +172,7 @@ attributes #2 = { nounwind } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @called( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META24:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 
[[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META24:![0-9]+]] !continuation.registercount [[META18:![0-9]+]] !continuation [[META25:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [3 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 @@ -201,7 +203,7 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [3 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [8 x i32] poison, [3 x i32] [[TMP28]]), !continuation.registercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [2 x i32] poison, [3 x i32] [[TMP28]]), !continuation.registercount [[META18]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -211,23 +213,23 @@ attributes #2 = { nounwind } ; ; ; CLEANUP-LABEL: define void @main( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META21:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 0, 0 ; CLEANUP-NEXT: [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP1]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount [[META21]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP1]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META22:![0-9]+]], !continuation.returnedRegistercount [[META22]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @main.resume.0( -; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META21]] !continuation [[META20]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [2 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META22]] !continuation [[META21]] { ; CLEANUP-NEXT: entryresume.0: -; CLEANUP-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 2 +; CLEANUP-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2 ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP3]], 0 -; CLEANUP-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 0 +; CLEANUP-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT4:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-NEXT: ret void @@ -236,7 +238,7 @@ attributes #2 = { nounwind } ; ; ; CLEANUP-LABEL: define void @main_no_call( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry 
[[META20]] !continuation.registercount [[META8]] !continuation [[META23:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -246,7 +248,7 @@ attributes #2 = { nounwind } ; ; ; CLEANUP-LABEL: define void @called( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META24:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META24:![0-9]+]] !continuation.registercount [[META18:![0-9]+]] !continuation [[META25:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i32] [[PAYLOAD]], 0 ; CLEANUP-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i32] [[PAYLOAD]], 1 @@ -257,7 +259,7 @@ attributes #2 = { nounwind } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; CLEANUP-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [8 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [2 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]), !continuation.registercount [[META18]] ; CLEANUP-NEXT: unreachable ; ; @@ -267,7 +269,7 @@ attributes #2 = { nounwind } ; ; ; POSTPROCESS-LABEL: define void @main( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation [[META21:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -277,18 +279,18 @@ attributes #2 = { nounwind } ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 0, 0 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) ; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP1]], i64 [[TMP2]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 2, i32 [[TMP1]], i64 [[TMP2]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @main.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation [[META20]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [2 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation [[META21]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP3]], 0 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT4:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: ret void @@ -297,7 +299,7 @@ attributes #2 = { nounwind } ; ; ; POSTPROCESS-LABEL: define void @main_no_call( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation [[META21:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage 
[[META8]] !continuation.entry [[META20]] !continuation [[META22:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -309,7 +311,7 @@ attributes #2 = { nounwind } ; ; ; POSTPROCESS-LABEL: define void @called( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation [[META23:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [3 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation [[META24:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -323,6 +325,6 @@ attributes #2 = { nounwind } ; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 ; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP1]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [8 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP1]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [2 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll b/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll new file mode 100644 index 0000000000..9e0aba31c2 --- /dev/null +++ b/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll @@ -0,0 +1,257 @@ +; Tests that if _cont_ExitRayGen ends with an enqueue, then we still free RayGen continuation state. +; This is a regression test, in an earlier version we only freed for returns and missed this case. +; RUN: grep -v "lgc.cps.module" %s | opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck %s +; RUN: opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck %s + +; There is just a single RayGen shader in this module, so any free must come from it. 
+; CHECK: call void @lgc.cps.free + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +%dx.types.Handle = type { i8* } +%struct.DispatchSystemData = type { <3 x i32> } +%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i64 } +%struct.SystemData = type { %struct.DispatchSystemData } +%struct.HitData = type { <3 x float>, <3 x float>, float, i32 } +%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } +%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } +%struct.RayPayload = type { <4 x float> } +%dx.types.ResourceProperties = type { i32, i32 } +%struct.BuiltInTriangleIntersectionAttributes2 = type { <2 x float> } +%struct.RaytracingAccelerationStructure = type { i32 } +%"class.RWTexture2D >" = type { <4 x float> } + +@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 +@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 + +define i32 @_cont_GetContinuationStackAddr() #0 { + ret i32 0 +} + +declare void @_AmdEnqueue(i64, i64, %struct.SystemData) + +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { + call void @_AmdEnqueue(i64 1, i64 1, %struct.SystemData poison) + unreachable +} + +declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 + +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 + +declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 + +declare !pointeetys !32 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData* %data) #0 + +declare !pointeetys !34 
%struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 + +declare !pointeetys !36 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 + +declare !pointeetys !37 void @_cont_SetTriangleHitAttributes(%struct.SystemData* %data, %struct.BuiltInTriangleIntersectionAttributes %val) + +define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !38 { + ret i32 5 +} + +declare i1 @opaqueIsEnd() + +define i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 !pointeetys !40 { + %isEnd = call i1 @opaqueIsEnd() + ret i1 %isEnd +} + +declare !pointeetys !42 i32 @_cont_HitKind(%struct.SystemData*) #0 + +; Function Attrs: nounwind +declare i64 @_AmdGetResumePointAddr() #1 + +; Function Attrs: nounwind +declare !pointeetys !43 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 + +; Function Attrs: nounwind +declare !pointeetys !44 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 + +; Function Attrs: nounwind +declare !pointeetys !43 void @_cont_AcceptHitAndEndSearch(%struct.DispatchSystemData* nocapture readnone) #1 + +; Function Attrs: nounwind +declare !pointeetys !44 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #1 + +; Function Attrs: nounwind +declare !pointeetys !43 void @_cont_IgnoreHit(%struct.DispatchSystemData* nocapture readnone) #1 + +; Function Attrs: nounwind +declare !pointeetys !44 void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* nocapture readnone) #1 + +define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !45 { + %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 + %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 + %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData 
%sys_data, 0 + %addr = call i64 @_AmdGetResumePointAddr() #3 + %trav_data2 = insertvalue %struct.TraversalData %trav_data, i64 %addr, 5 + %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i64 4, %struct.TraversalData %trav_data2) + store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 + call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) + ret void +} + +declare !pointeetys !46 void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0; + +declare !pointeetys !47 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 + +declare !pointeetys !48 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData* %data) + +declare !pointeetys !49 <3 x float> @_cont_ObjectRayOrigin3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) + +declare !pointeetys !49 <3 x float> @_cont_ObjectRayDirection3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) + +declare !pointeetys !51 float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) + +declare i32 @opaque() +declare void @use(i32) + +; Function Attrs: nounwind +define void @MyRayGen() #2 { + %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 + %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 + %3 = alloca %struct.RayPayload, align 4 + %4 = bitcast %struct.RayPayload* %3 to i8* + call void @llvm.lifetime.start.p0i8(i64 16, i8* %4) #1 + %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 + store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !52 + %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) + %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, 
i32 0 }) + ; Ensure continuation state + %cont.state = call i32 @opaque() + call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) + call void @use(i32 %cont.state) + %8 = load <4 x float>, <4 x float>* %5, align 4, !tbaa !52 + %9 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) + %10 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) + %11 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) + %12 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %11, %dx.types.ResourceProperties { i32 4098, i32 1033 }) + %13 = extractelement <4 x float> %8, i64 0 + %14 = extractelement <4 x float> %8, i64 1 + %15 = extractelement <4 x float> %8, i64 2 + %16 = extractelement <4 x float> %8, i64 3 + call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %12, i32 %9, i32 %10, i32 undef, float %13, float %14, float %15, float %16, i8 15) + call void @llvm.lifetime.end.p0i8(i64 16, i8* %4) #1 + ret void +} + +; Function Attrs: nounwind +declare !pointeetys !59 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #1 + +; Function Attrs: nounwind +declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #1 + +; Function Attrs: nounwind memory(none) +declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #3 + +; Function Attrs: nounwind memory(none) +declare float @dx.op.objectRayDirection.f32(i32, i8) #3 + +; Function Attrs: nounwind memory(none) +declare float @dx.op.objectRayOrigin.f32(i32, i8) #3 + +; Function Attrs: nounwind memory(read) +declare float @dx.op.rayTCurrent.f32(i32) #4 + +declare void 
@dx.op.acceptHitAndEndSearch(i32) #0 + +declare void @dx.op.ignoreHit(i32) #0 + +; Function Attrs: nounwind +declare !pointeetys !60 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #1 + +; Function Attrs: nounwind +declare !pointeetys !61 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes2(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes2*) #1 + +; Function Attrs: nounwind memory(none) +declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3 + +; Function Attrs: nounwind memory(read) +declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare !pointeetys !63 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare !pointeetys !63 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 + +attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nounwind memory(read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!llvm.ident = !{!0} +!dx.version = !{!1} +!dx.valver = !{!1} +!dx.shaderModel = !{!2} +!dx.resources = !{!3} +!dx.typeAnnotations = !{!10} 
+!dx.entryPoints = !{!18, !29 } +!lgc.cps.module = !{} + +!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} +!1 = !{i32 1, i32 6} +!2 = !{!"lib", i32 6, i32 6} +!3 = !{!4, !7, null, null} +!4 = !{!5} +!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} +!6 = !{i32 0, i32 4} +!7 = !{!8} +!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} +!9 = !{i32 0, i32 9} +!10 = !{i32 1, void ()* @MyRayGen, !11} +!11 = !{!12} +!12 = !{i32 1, !13, !13} +!13 = !{} +!14 = !{!12, !15, !16} +!15 = !{i32 2, !13, !13} +!16 = !{i32 0, !13, !13} +!17 = !{!12, !15} +!18 = !{null, !"", null, !3, !19} +!19 = !{i32 0, i64 65536} +!21 = !{i32 8, i32 9, i32 6, i32 16, i32 7, i32 8, i32 5, !22} +!22 = !{i32 0} +!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !22} +!26 = !{i32 8, i32 8, i32 5, !22} +!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22} +!29 = !{void ()* @MyRayGen, !"MyRayGen", null, null, !30} +!30 = !{i32 8, i32 7, i32 5, !22} +!32 = !{%struct.AnyHitTraversalData poison} +!33 = !{i32 0, %struct.AnyHitTraversalData poison} +!34 = !{%struct.SystemData poison} +!35 = !{i32 0, %struct.SystemData poison} +!36 = !{%struct.SystemData poison} +!37 = !{%struct.SystemData poison} +!38 = !{%struct.DispatchSystemData poison} +!39 = !{i32 0, %struct.DispatchSystemData poison} +!40 = !{%struct.TraversalData poison} +!41 = !{i32 0, %struct.TraversalData poison} +!42 = !{%struct.SystemData poison} +!43 = !{%struct.DispatchSystemData poison} +!44 = !{%struct.AnyHitTraversalData poison} +!45 = !{%struct.DispatchSystemData poison} +!46 = !{%struct.DispatchSystemData poison} +!47 = !{%struct.AnyHitTraversalData poison} +!48 = 
!{%struct.DispatchSystemData poison} +!49 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} +!50 = !{i32 0, %struct.HitData poison} +!51 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} +!52 = !{!53, !53, i64 0} +!53 = !{!"omnipotent char", !54, i64 0} +!54 = !{!"Simple C/C++ TBAA"} +!55 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} +!56 = !{i32 0, %struct.RayPayload poison} +!57 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} +!58 = !{%struct.RayPayload poison} +!59 = !{%struct.RayPayload poison} +!60 = !{%struct.BuiltInTriangleIntersectionAttributes poison} +!61 = !{%struct.BuiltInTriangleIntersectionAttributes2 poison} +!62 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes2 poison} +!63 = !{i8 poison} +!64 = !{i32 0, i8 poison} diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll index 03fef54610..e3de9f1614 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll @@ -48,7 +48,7 @@ entry: !dx.entryPoints = !{!0, !3} !continuation.maxPayloadRegisterCount = !{!7} -!continuation.preservedPayloadRegisterCount = !{!8} +!continuation.maxUsedPayloadRegisterCount = !{!8} !0 = !{null, !"", null, !1, !6} !1 = !{!2, null, null, null} diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll index 6c498ff781..0c341d0fe3 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll @@ -26,7 +26,7 @@ define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeet ; Need _cont_ReportHit to get system data type declare !pointeetys !22 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) 
-declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) declare !pointeetys !15 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) @@ -38,7 +38,7 @@ define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwin define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !pointeetys !18 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -102,7 +102,7 @@ attributes #0 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP3]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [20 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [19 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } @await(ptr [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -136,7 +136,7 @@ attributes #0 = { nounwind } ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP3]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa19i32a1i32s(i32 2, i32 4, i32 5, [20 x i32] poison, [1 x i32] [[TMP4]]), !continuation.returnedRegistercount [[META14]], !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa19i32a1i32s(i32 2, i32 4, i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [19 x i32] poison, [1 x i32] [[TMP4]]), !continuation.returnedRegistercount [[META14]], !continuation.registercount [[META14]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP5]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[TMP6]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4 @@ -168,7 +168,7 @@ attributes #0 = { nounwind } ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 undef, 0 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 2, i32 [[TMP3]], i64 [[TMP4]], i32 5, [20 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP3]], i64 [[TMP4]], i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [19 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll index 57527032aa..2c4aa40ae2 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll @@ -28,7 +28,7 @@ declare !pointeetys !27 void @_cont_SetTriangleHitAttributes(%struct.SystemData* declare %struct.DispatchSystemData @_cont_Traversal(%struct.TraversalData) #0 -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 +declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, i64, %struct.AnyHitTraversalData) #0 declare !pointeetys !28 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 @@ -79,7 +79,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !36 { %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) + %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.AnyHitTraversalData %trav_data) store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 ret i1 true } @@ -181,7 +181,7 @@ declare !pointeetys !46 [4 x <3 x float>] @_cont_WorldToObject4x3(%struct.Dispat ; Function Attrs: nounwind define void @RayGen() #3 { ; LOWERRAYTRACINGPIPELINE-LABEL: define void @RayGen( -; 
LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META29:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META18]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META30:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META18]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -190,7 +190,7 @@ define void @RayGen() #3 { ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @RayGen( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META28:![0-9]+]] !continuation.entry [[META13:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META29:![0-9]+]] !continuation.entry [[META13:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -206,7 +206,7 @@ define void @RayGen() #3 { ; Function Attrs: nounwind define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @Intersection( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] 
[[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META30:![0-9]+]] !continuation [[META31:![0-9]+]] !continuation.registercount [[META25:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META31:![0-9]+]] !continuation [[META32:![0-9]+]] !continuation.registercount [[META26:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 @@ -234,11 +234,11 @@ define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float 4.000000e+00, i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]], [19 x i32] poison, [30 x i32] [[TMP13]]), !continuation.registercount [[META25]], !continuation.returnedRegistercount [[META25]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } @await(ptr [[TMP20]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP22]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]], {} poison, [30 x 
i32] [[TMP13]]), !continuation.registercount [[META26]], !continuation.returnedRegistercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await(ptr [[TMP20]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP22]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP22]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP22]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP14]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() @@ -246,16 +246,16 @@ define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE: 19: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], [8 x i32] poison, [30 x i32] [[TMP21]]), !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], [2 x i32] poison, [30 x i32] [[TMP21]]), !continuation.registercount [[META26]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 22: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], [2 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META26]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @Intersection( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META29:![0-9]+]] !continuation [[META30:![0-9]+]] !continuation.stacksize [[META31:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META30:![0-9]+]] !continuation [[META31:![0-9]+]] !continuation.stacksize [[META25:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 @@ -366,7 +366,7 @@ define void @Intersection() #3 { ; DXILCONTPOSTPROCESS-NEXT: 
[[DOTFCA_29_INSERT96:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT93]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(ptr @Intersection.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP11]], i64 [[TMP12]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_1_INSERT]], float 4.000000e+00, i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [19 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT96]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP11]], i64 [[TMP12]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_1_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], {} poison, [30 x i32] [[DOTFCA_29_INSERT96]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) @@ -381,39 +381,38 @@ define void @Intersection() #3 { ; Function Attrs: nounwind define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !pointeetys !47 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @AnyHit( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] !continuation.registercount [[META26:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META33:![0-9]+]] !continuation [[META34:![0-9]+]] 
!continuation.registercount [[META27:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP21]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store 
[[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP28]], ptr [[ORIGHITATTRS]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 @@ -450,22 +449,22 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] [[TMP55]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP42]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr 
inbounds i32, ptr [[TMP15]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP59]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP61]], ptr [[TMP62]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP67]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP60]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP66]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP70]], align 4 @@ -474,27 +473,21 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP57]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP56]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP58]], [8 x i32] poison, [10 x i32] [[TMP73]]), !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load [4 x i32], ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP58]], [2 x i32] poison, [4 x i32] [[TMP72]]), !continuation.registercount [[META27]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @AnyHit( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: 
[[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 0, 0, 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_0_0_EXTRACT]], ptr 
[[DOTFCA_0_0_0_0_GEP]], align 4 @@ -625,18 +618,12 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_GEP21:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP21]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP23]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP24]], 7 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[DOTFCA_2_EXTRACT]], 8 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[DOTFCA_3_EXTRACT]], 9 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP23]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] 
[[DOTFCA_0_INSERT]], i32 [[TMP24]], 1 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[DOTFCA_2_EXTRACT]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[DOTFCA_3_EXTRACT]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[TMP30:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP30]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP30]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) @@ -654,33 +641,33 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; Function Attrs: nounwind define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !pointeetys !47 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @ClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] !continuation.registercount [[META26]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [13 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] !continuation.registercount [[META27]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4 @@ -705,43 +692,37 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I1:%.*]] = load i32, ptr [[RESPTR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP36]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP38]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP35]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP49]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP38]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP47]], ptr [[TMP48]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP53]], ptr [[TMP49]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP52]], [20 x i32] poison, [10 x i32] [[TMP45]]), !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP52]], [14 x i32] poison, [4 x i32] [[TMP45]]), !continuation.registercount [[META27]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [13 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; DXILCONTPOSTPROCESS-NEXT: 
[[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_EXTRACT]], ptr [[DOTFCA_0_0_GEP]], align 4 @@ -780,18 +761,12 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP14]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[PAYLOAD_FCA_7_EXTRACT]], 7 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[PAYLOAD_FCA_8_EXTRACT]], 8 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[PAYLOAD_FCA_9_EXTRACT]], 9 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [4 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 
x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[TMP19:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP19]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [20 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP19]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [14 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) @@ -862,6 +837,7 @@ attributes #4 = { nounwind } !dx.resources = !{!3} !dx.typeAnnotations = !{!10} !dx.entryPoints = !{!14, !16, !19, !21, !23} +!lgc.rt.max.attribute.size = !{!51} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -914,3 +890,4 @@ attributes #4 = { nounwind } !48 = !{i32 0, %struct.RayPayload poison} !49 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} !50 = !{%struct.BuiltInTriangleIntersectionAttributes poison} +!51 = !{i32 8} diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll index 8d69eab346..1d5f3b5b9e 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll @@ -193,7 +193,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 
x i32] [[TMP10]]), !continuation.registercount [[META17]], !waitmask [[META20:![0-9]+]], !continuation.returnedRegistercount [[META17]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } @await(ptr [[TMP19]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } @await.2(ptr [[TMP19]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP25]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP13]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SMALLPAYLOAD]] poison, ptr [[P1]], align 4 @@ -204,7 +204,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP11]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT10:%.*]] -; LOWERRAYTRACINGPIPELINE: .split12: +; LOWERRAYTRACINGPIPELINE: .split: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP16]], align 4 @@ -249,7 +249,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT9:%.*]] -; LOWERRAYTRACINGPIPELINE: .split11: +; LOWERRAYTRACINGPIPELINE: .split.split: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = 
getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I5:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP53]], align 4 @@ -281,7 +281,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP52]], ptr addrspace(32) [[TMP50]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[TMP62]]), !continuation.registercount [[META13]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } @await.2(ptr [[TMP63]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } @await(ptr [[TMP63]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP64]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [2 x i32] [[TMP65]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_LARGEPAYLOAD]] poison, ptr [[P3]], align 4 @@ -310,7 +310,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP75]], ptr [[TMP53]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] -; LOWERRAYTRACINGPIPELINE: .split: +; LOWERRAYTRACINGPIPELINE: .split.split.split: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP70]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP84]], ptr 
[[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll index 95170965dc..ed75c1e686 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll @@ -28,7 +28,7 @@ declare i32 @_cont_GetContinuationStackAddr() declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) declare !pointeetys !13 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) @@ -49,7 +49,7 @@ declare !pointeetys !22 <3 x i32> @_cont_DispatchRaysDimensions3(%struct.Dispatc define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #1 !pointeetys !18 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -79,6 +79,7 @@ attributes #1 = { alwaysinline } !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6} !lgc.cps.module = !{} +!lgc.rt.max.attribute.size = !{!25} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -105,6 +106,7 @@ attributes #1 = { alwaysinline } !22 = !{%struct.DispatchSystemData poison} !23 = !{i32 0, %struct.AnyHitTraversalData poison} !24 = !{%struct.AnyHitTraversalData poison} +!25 = !{i32 8} ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( ; 
LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) { @@ -112,7 +114,7 @@ attributes #1 = { alwaysinline } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @called( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META14:![0-9]+]] !continuation [[META17:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !continuation.registercount [[META15:![0-9]+]] !continuation [[META18:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 @@ -127,15 +129,15 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [17 x i32] poison, [1 x i32] [[TMP7]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } @await(ptr [[TMP11]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP12]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, 
[[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP7]]), !continuation.registercount [[META15]], !continuation.returnedRegistercount [[META15]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } @await(ptr [[TMP11]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP12]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP13]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_MYPARAMS]] poison, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP12]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP12]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP8]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] @@ -149,7 +151,7 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], [16 x i32] poison, [1 x i32] [[TMP20]]), !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], [10 x i32] poison, [1 x i32] [[TMP20]]), !continuation.registercount [[META15]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -159,7 +161,7 @@ attributes #1 = { alwaysinline } ; ; ; CLEANUP-LABEL: define void @called( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META14:![0-9]+]] !continuation [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] !continuation.state [[META18]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !continuation.registercount [[META15:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] !continuation.state [[META13]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -170,17 +172,17 @@ attributes #1 = { alwaysinline } ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT9]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0) -; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP2]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [17 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] +; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP2]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META15]], !continuation.returnedRegistercount [[META15]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @called.resume.0( -; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [16 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META14]] !continuation [[META17]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META17]] !continuation.registercount [[META15]] !continuation [[META18]] { ; CLEANUP-NEXT: entryresume.0: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 2 +; CLEANUP-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 2 ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP4]], 0 -; CLEANUP-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 0 +; CLEANUP-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) 
[[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -192,7 +194,7 @@ attributes #1 = { alwaysinline } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]), !continuation.registercount [[META14]] +; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]), !continuation.registercount [[META15]] ; CLEANUP-NEXT: unreachable ; ; @@ -202,7 +204,7 @@ attributes #1 = { alwaysinline } ; ; ; POSTPROCESS-LABEL: define void @called( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -219,23 +221,23 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-NEXT: 
[[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [17 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP6]], i64 [[TMP7]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [16 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation [[META17]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META17]] !continuation [[META18]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4 ; POSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP11]], -8 -; POSTPROCESS-NEXT: [[TMP12:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP12:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 2 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP12]], 0 -; POSTPROCESS-NEXT: 
[[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) @@ -251,7 +253,7 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP9]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP10]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP10]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]) ; POSTPROCESS-NEXT: unreachable ; ; @@ -261,7 +263,7 @@ attributes #1 = { alwaysinline } ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @called( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] !continuation [[META19:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 @@ -276,14 +278,14 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP5]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa16i32a1i32s(i32 2, i32 4, i32 5, [17 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP7]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa10i32a1i32s(i32 2, i32 4, i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META15:![0-9]+]], !continuation.returnedRegistercount [[META15]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP7]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[TMP8]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_MYPARAMS]] poison, ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP7]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP7]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] @@ -297,7 +299,7 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP16]], [16 x i32] poison, [1 x i32] [[TMP17]]), !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP16]], [10 x i32] poison, [1 x i32] [[TMP17]]), !continuation.registercount [[META15]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -307,7 +309,7 @@ attributes #1 = { alwaysinline } ; ; ; CLEANUP-CPS-LABEL: define void @called( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] !continuation [[META19:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURN_ADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -318,33 +320,33 @@ attributes #1 = { alwaysinline } ; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[SYSTEM_DATA_FCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @called.resume.0) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 2, i32 4, {} poison, i64 [[TMP0]], i32 5, [17 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 2, i32 4, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META15:![0-9]+]], !continuation.returnedRegistercount [[META15]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @called.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [16 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META17]] !continuation [[META18]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META17]] !lgc.cps [[META18]] !continuation [[META19]] { ; CLEANUP-CPS-NEXT: entryresume.0: -; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, align 8 -; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, align 8 +; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 2 ; CLEANUP-CPS-NEXT: 
[[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP6]], 0 -; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-CPS-NEXT: [[RETURN_ADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[TMP5]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[RETURN_ADDR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RETURN_ADDR_RELOAD_ADDR]], align 4 -; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP8]]) ; CLEANUP-CPS-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP9]], i8 0 -; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP10]]) ; CLEANUP-CPS-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP11]], i8 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT10]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META15]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -354,7 +356,7 @@ attributes #1 = { alwaysinline } ; ; ; POSTPROCESS-CPS-LABEL: define void @called( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [16 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META16:![0-9]+]] !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META17:![0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !lgc.cps [[META19:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -371,31 +373,31 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 2, i32 [[TMP4]], i64 [[TMP5]], i32 5, [17 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP4]], i64 [[TMP5]], i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [16 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] !lgc.cps [[META18]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META17]] !lgc.rt.shaderstage [[META18]] !lgc.cps [[META19]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: -; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, align 8 +; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 +; POSTPROCESS-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], -8 -; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], 2 +; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 2 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = 
extractvalue [1 x i32] [[TMP7]], 0 -; POSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] } [[TMP3]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP8]], 0 ; POSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(21) ; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0 ; POSTPROCESS-CPS-NEXT: [[RETURN_ADDR_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP10]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP11]]) ; POSTPROCESS-CPS-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP12]], i8 0 -; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [16 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP13]]) ; POSTPROCESS-CPS-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP14]], i8 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT10]], 0 @@ -405,6 +407,6 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-CPS-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = zext i32 [[RETURN_ADDR_RELOAD]] to 
i64 ; POSTPROCESS-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP17]], i32 [[TMP18]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP17]], i32 [[TMP18]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline.ll b/llvmraytracing/test/dx/lower-rt-pipeline.ll index e595b929d8..54e4b0cdc6 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline.ll @@ -35,7 +35,7 @@ declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalDat declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 +declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, i64, %struct.AnyHitTraversalData) #0 define %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData* %data) #0 !pointeetys !32 { %resPtr = getelementptr %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, i32 0, i32 0 @@ -115,7 +115,7 @@ define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hi callAHit: ; preds = %0 %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) + %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.AnyHitTraversalData %trav_data) store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 call void 
@_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) ret i1 true @@ -351,6 +351,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !dx.typeAnnotations = !{!10} !dx.entryPoints = !{!18, !20, !23, !25, !27, !29, !31} !lgc.cps.module = !{} +!lgc.rt.max.attribute.size = !{!65} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -417,6 +418,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !62 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes2 poison} !63 = !{i8 poison} !64 = !{i32 0, i8 poison} +!65 = !{i32 8} ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetContinuationStackAddr( ; LOWERRAYTRACINGPIPELINE-SAME: ) #[[ATTR0:[0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: ret i32 0 @@ -497,9 +499,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META22]] !continuation [[META35:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META22]] !continuation [[META36:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 @@ -508,7 +510,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA36:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -518,53 +520,53 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr [[TMP37]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP38]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [16 x i32] poison, [10 x i32] [[TMP39]]), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount [[META33]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } @await(ptr [[TMP40]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } 
[[TMP41]], 2 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[TMP42]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP43]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load [4 x i32], ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [4 x i32] [[TMP31]]), !continuation.registercount [[META34:![0-9]+]], !continuation.returnedRegistercount [[META34]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } @await(ptr [[TMP39]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP41]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[TMP42]], ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, 
ptr [[TMP37]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP44]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP45]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP41]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP47]], ptr [[TMP43]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP41]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP19]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa 
[[TBAA36]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP30]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP46]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP28]], i64 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP28]], i64 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP28]], i64 2 @@ -576,30 +578,30 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META40:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 
[[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META41:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP39]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr 
inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP42]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP13]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP2]], align 4 @@ -622,29 +624,29 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 ; 
LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP30]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP32]], ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP29]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP34]], ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP36]], ptr 
[[TMP40]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP36]], ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP48]], ptr [[TMP40]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [33 x i32] poison, [10 x i32] [[TMP45]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [4 x i32], ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [27 x i32] poison, [4 x i32] [[TMP45]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META42:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META43:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -655,32 +657,31 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load 
i32, ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr [[TMP27]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP22]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP42]], ptr [[ORIGHITATTRS]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 @@ -727,174 +728,172 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP36:%.*]] = fcmp fast ogt float [[TMP34]], 1.000000e+00 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP34]], -1.000000e+00 ; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP35]], label [[TMP38:%.*]], label [[TMP73:%.*]] -; LOWERRAYTRACINGPIPELINE: 42: +; LOWERRAYTRACINGPIPELINE: 41: ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP56:%.*]] -; LOWERRAYTRACINGPIPELINE: 43: +; LOWERRAYTRACINGPIPELINE: 42: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP40]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP46]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP49]], ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP48]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP52]], ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr 
[[TMP48]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr [[TMP62]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr [[TMP47]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP66]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP68]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP81]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP41]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP48]], ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr [[TMP49]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = load i32, ptr [[TMP60]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP63]], ptr [[TMP52]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr 
[[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP81]], ptr [[TMP80]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP53]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], [8 x i32] poison, [10 x i32] [[TMP63]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], [2 x i32] poison, [4 x i32] [[TMP62]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE: 64: +; LOWERRAYTRACINGPIPELINE: 63: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP57]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP58]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP69]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP82]], ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP86]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP87]], ptr [[TMP85]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP88]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP97]], ptr [[TMP64]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load i32, ptr 
[[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP105]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP78]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP80]], ptr [[TMP111]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP58]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP66]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP68]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP67]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP84]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP87]], ptr [[TMP82]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP74]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP75]], ptr [[TMP88]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP103]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP104]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP79]], ptr [[TMP78]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP9]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP70]], ptr [[TMP71]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]], [8 x i32] poison, [10 x i32] [[TMP84]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]], [2 x i32] poison, [4 x i32] [[TMP83]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable +; LOWERRAYTRACINGPIPELINE: 84: +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP37]], label [[TMP85:%.*]], label [[TMP128:%.*]] ; LOWERRAYTRACINGPIPELINE: 85: -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP37]], label [[TMP74:%.*]], label [[TMP109:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP36]], label [[TMP86:%.*]], label [[TMP109:%.*]] ; LOWERRAYTRACINGPIPELINE: 86: -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP36]], label [[TMP75:%.*]], label [[TMP92:%.*]] -; LOWERRAYTRACINGPIPELINE: 87: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_IgnoreHit(ptr [[TMP76]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP77]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP119]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP127]], ptr [[TMP79]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP128:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[TMP119]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = load i32, ptr [[TMP95]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP129]], ptr [[TMP128]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = getelementptr inbounds i32, ptr [[TMP119]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP98]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP99]], ptr [[TMP83]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP101]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP102]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP104]], ptr [[TMP103]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP105]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP126]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP92]], ptr [[TMP125]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP144]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP95]], ptr [[TMP129]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP96:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP97]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP98]], ptr [[TMP96]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP99]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = load i32, ptr [[TMP100]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP102]], ptr [[TMP101]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP89:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP89]], ptr [[TMP90]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]], [8 x i32] poison, [10 x i32] [[TMP132]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]], [2 x i32] poison, [4 x i32] [[TMP145]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE: 109: +; LOWERRAYTRACINGPIPELINE: 107: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP93:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_IgnoreHit(ptr [[TMP93]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = load i32, ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP134]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP114]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP115]], ptr [[TMP96]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[TMP96]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP114]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = load i32, ptr [[TMP117]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP118]], ptr [[TMP140]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP96]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[TMP114]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP121]], ptr [[TMP100]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP147:%.*]] = getelementptr 
inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP148:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP148]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP149:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP149]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP126]], ptr [[TMP125]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP146:%.*]] = load i32, ptr [[TMP94]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP146]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP112]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[TMP111]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP116]], ptr [[TMP114]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = load i32, ptr [[TMP118]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP119]], ptr [[TMP117]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; 
LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP120]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP148:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP149:%.*]] = load i32, ptr [[TMP121]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP149]], ptr [[TMP148]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP106:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP106]], ptr [[TMP107]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]], [8 x i32] poison, [10 x i32] [[TMP130]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP127:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]], [2 x i32] poison, [4 x i32] [[TMP127]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE: 131: +; LOWERRAYTRACINGPIPELINE: 128: ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP110]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP133]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP110]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP130]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP132]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP133]], ptr [[TMP131]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP136]], ptr [[TMP112]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[TMP112]], i32 1 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP135]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP136]], ptr [[TMP134]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP139:%.*]] = load i32, ptr [[TMP138]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP139]], ptr [[TMP137]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = getelementptr inbounds i32, ptr [[TMP112]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP141:%.*]] = getelementptr inbounds i32, ptr [[TMP135]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = load i32, ptr [[TMP141]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP142]], ptr [[TMP116]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP143]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP146:%.*]] = load i32, ptr [[TMP144]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP146]], ptr [[TMP145]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP140]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP141:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load i32, ptr [[TMP141]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP143]], ptr [[TMP142]], align 4 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP122]], ptr [[TMP123]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP150:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]], [8 x i32] poison, [10 x i32] [[TMP150]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP147:%.*]] = load [4 x i32], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]], [2 x i32] poison, [4 x i32] [[TMP147]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation.registercount [[META33:![0-9]+]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -919,11 +918,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], [32 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META32]], 
!continuation.returnedRegistercount [[META32]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } @await.1(ptr [[TMP23]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP24]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], {} poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await.1(ptr [[TMP23]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP26]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP24]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP10]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -947,18 +946,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 23: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [2 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 26: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyIntersectionShader2( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43]] !continuation.registercount [[META32]] !continuation [[META45:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META44]] !continuation.registercount [[META33]] !continuation [[META46:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2:%.*]], align 4 @@ -983,11 +982,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP8]], [32 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount 
[[META32]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } @await.2(ptr [[TMP23]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP24]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP8]], {} poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await.2(ptr [[TMP23]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP26]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP24]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP10]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -1011,61 +1010,61 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 23: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call 
void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [2 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 26: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyMissShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !continuation.registercount [[META34]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP23]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP26]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> , ptr [[TMP12]], align 4 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[TMP28]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP30]], 
align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP32]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], [33 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [4 x i32], ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], [27 x i32] poison, [4 x i32] [[TMP29]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -1149,9 +1148,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 @@ -1160,7 +1159,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 
0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA38:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -1170,46 +1169,46 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP11]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 ; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa33i32a10i32s(i32 4, i32 8, i32 5, [36 x i32] poison, [10 x i32] [[TMP21]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP22]], 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 3 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP27]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [4 x i32], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa27i32a4i32s(i32 4, i32 8, i32 5, [30 x i32] poison, [4 x i32] [[TMP21]]), !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP22]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [4 x i32] [[TMP23]], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP12]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP35]], ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP22]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 3 
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP47]], ptr [[TMP45]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP22]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP24]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: .split: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA38]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() @@ -1227,29 +1226,29 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40:![0-9]+]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] 
!lgc.rt.shaderstage [[META41:![0-9]+]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP4]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP7]], ptr [[TMP3]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], 
i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP13]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP13]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP34]], ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP14]], ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP1]], align 4 @@ -1273,28 +1272,28 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP28]], ptr [[TMP29]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr 
[[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP31]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP34]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP31]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP37]], ptr [[TMP35]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP37]], ptr [[TMP32]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP40]], ptr [[TMP38]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP40]], ptr [[TMP35]], align 4 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP45]], ptr [[TMP38]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP42]], [33 x i32] poison, [10 x i32] [[TMP43]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = load [4 x i32], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP42]], [27 x i32] poison, [4 x i32] [[TMP43]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META34]] !continuation [[META44:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -1305,31 +1304,30 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = alloca [4 x i32], align 4 ; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP22]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP9]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP25]], ptr [[ORIGHITATTRS]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 @@ -1376,174 
+1374,172 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], 1.000000e+00 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP35]], -1.000000e+00 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP82:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 39: +; LOWERRAYTRACINGPIPELINE-CPS: 38: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP37]], label [[TMP40:%.*]], label [[TMP61:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 40: +; LOWERRAYTRACINGPIPELINE-CPS: 39: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP41]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP43]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP45]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP46]], ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 
[[TMP49]], ptr [[TMP47]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP52]], ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP53]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP54]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP56]], ptr [[TMP55]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP43]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP45]], ptr [[TMP56]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP48]], ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 3 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP51]], ptr [[TMP49]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP52]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP53]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP55]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP57]], ptr [[TMP58]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP59:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP59]], [8 x i32] poison, [10 x i32] [[TMP60]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP59]], [2 x i32] poison, [4 x i32] [[TMP60]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: 61: +; LOWERRAYTRACINGPIPELINE-CPS: 60: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP62]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP64]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP66]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP67]], ptr [[TMP65]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP65]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP70]], ptr [[TMP68]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP65]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP72]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP73]], ptr 
[[TMP71]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP74]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP75]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP77]], ptr [[TMP76]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP77]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP66]], ptr [[TMP64]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP68]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP69]], ptr [[TMP67]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP71]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP72]], ptr [[TMP70]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 
[[TMP73]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP74]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP76]], ptr [[TMP75]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP78:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP78]], ptr [[TMP79]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP80:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP81:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP80]], [8 x i32] poison, [10 x i32] [[TMP81]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP81:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP80]], [2 x i32] poison, [4 x i32] [[TMP81]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable +; LOWERRAYTRACINGPIPELINE-CPS: 81: +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP38]], label [[TMP84:%.*]], label [[TMP141:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 82: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP38]], label [[TMP83:%.*]], label [[TMP128:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP37]], label [[TMP83:%.*]], label [[TMP105:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 83: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP37]], label [[TMP84:%.*]], label [[TMP106:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 84: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP85:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP85]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP86:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP86]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP87]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP89]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP90]], ptr [[TMP88]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[TMP88]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP92:%.*]] = getelementptr inbounds i32, ptr [[TMP89]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP92]], 
align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP93]], ptr [[TMP91]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[TMP88]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[TMP89]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP96]], ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP98:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP98]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP99:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP99]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP101]], ptr [[TMP100]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP100:%.*]] = load i32, ptr [[TMP86]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP100]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP88]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP89]], ptr [[TMP87]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TMP92:%.*]] = load i32, ptr [[TMP91]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP92]], ptr [[TMP90]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP93:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP94]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP95]], ptr [[TMP93]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP96:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP96]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP98:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP97]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP99]], ptr [[TMP98]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP102:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP102]], ptr [[TMP103]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP104:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP105:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP104]], [8 x i32] poison, [10 x i32] [[TMP105]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP106:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP104]], [2 x i32] poison, [4 x i32] [[TMP106]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: 106: +; LOWERRAYTRACINGPIPELINE-CPS: 104: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP107]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP108]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP109]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP112:%.*]] = load i32, ptr [[TMP111]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP112]], ptr [[TMP110]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP113:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP111]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP114]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP115]], ptr [[TMP113]], 
align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP116:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP111]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP118:%.*]] = load i32, ptr [[TMP117]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP118]], ptr [[TMP116]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP120:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP120]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP122:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP121]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP123]], ptr [[TMP122]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP108]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP121]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP122:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP109:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP109]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP110]], ptr [[TMP122]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP112]], align 4 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP113]], ptr [[TMP111]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP116]], ptr [[TMP114]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP117:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP117]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP118:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP119:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP118]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP120]], ptr [[TMP119]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP124:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP124]], ptr [[TMP125]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP126:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP127:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP126]], [8 x i32] poison, [10 x i32] [[TMP127]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP142:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP126]], [2 x i32] poison, [4 x i32] [[TMP142]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: 128: +; LOWERRAYTRACINGPIPELINE-CPS: 125: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP129]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP130]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP131:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP129]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP127]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP128:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP143:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP143]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP130]], ptr [[TMP128]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP131:%.*]] = 
getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP132]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP133]], ptr [[TMP131]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP134:%.*]] = getelementptr inbounds i32, ptr [[TMP131]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP132]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP134:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 3 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP136]], ptr [[TMP134]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[TMP131]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP132]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP139:%.*]] = load i32, ptr [[TMP138]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP139]], ptr [[TMP137]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP140:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP140]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP141:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP142:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP143:%.*]] = load i32, ptr [[TMP141]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP143]], ptr [[TMP142]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP137:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 
[[TMP137]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP139:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP140:%.*]] = load i32, ptr [[TMP138]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP140]], ptr [[TMP139]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP144:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP145:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP144]], ptr [[TMP145]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP146:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP147:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP146]], [8 x i32] poison, [10 x i32] [[TMP147]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP147:%.*]] = load [4 x i32], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP146]], [2 x i32] poison, [4 x i32] [[TMP147]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -1568,10 +1564,10 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [32 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa2i32a30i32s(i32 3, i32 16, i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], {} poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP9]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP11]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -1595,18 +1591,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS: 21: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr 
[[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 24: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [8 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [2 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader2( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2:%.*]], align 4 @@ -1631,10 +1627,10 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP7]], [32 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa2i32a30i32s(i32 3, i32 16, i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP7]], {} poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP9]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP11]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -1658,60 +1654,60 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS: 21: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], 
align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 24: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [8 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [2 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyMissShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META41]] !continuation [[META48:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !lgc.cps [[META42]] !continuation [[META48:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = alloca [4 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 4 -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP6]], ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP9]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 
[[SHADER_INDEX]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> , ptr [[TMP13]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP24]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP29]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [33 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [4 x i32], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [27 x i32] poison, [4 x i32] [[TMP27]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -1795,7 +1791,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-LABEL: define void @MyRayGen( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META35:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META36:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1819,40 +1815,28 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 ; POSTPROCESS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP11]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x 
i32] [[DOTFCA_4_INSERT]], i32 undef, 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP7]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP8]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP9]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP11]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP7]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP8]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP9]], 3 ; POSTPROCESS-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 4, i32 [[TMP10]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [16 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 4, i32 [[TMP10]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyRayGen.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META35]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META36]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[TMP19:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP19]], align 4 -; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP1]], 2 -; POSTPROCESS-NEXT: [[TMP10:%.*]] = extractvalue [10 x i32] [[TMP16]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP16]], 6 -; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue [10 x i32] [[TMP16]], 7 -; POSTPROCESS-NEXT: 
[[TMP5:%.*]] = extractvalue [10 x i32] [[TMP16]], 8 -; POSTPROCESS-NEXT: [[TMP7:%.*]] = extractvalue [10 x i32] [[TMP16]], 9 +; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP10:%.*]] = extractvalue [4 x i32] [[TMP16]], 0 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue [4 x i32] [[TMP16]], 1 +; POSTPROCESS-NEXT: [[TMP5:%.*]] = extractvalue [4 x i32] [[TMP16]], 2 +; POSTPROCESS-NEXT: [[TMP7:%.*]] = extractvalue [4 x i32] [[TMP16]], 3 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP10]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; POSTPROCESS-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float @@ -1861,7 +1845,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP6]], i32 2 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP7]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP8]], i32 3 -; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP17]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: [[TMP18:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ -1896,21 +1880,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-LABEL: define void @MyClosestHitShader( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] 
[[TMP0:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue 
[[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; POSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_EXTRACT]], ptr [[DOTFCA_0_0_GEP]], align 4 @@ -1954,37 +1932,25 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP23]], i32 0, i32 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP19]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP19]], 0 +; POSTPROCESS-NEXT: 
[[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT1]], i32 [[TMP20]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP21]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP22]], 3 ; POSTPROCESS-NEXT: [[TMP28:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP28]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP28]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define void @MyAnyHitShader( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue 
[10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 0, 0, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 ; POSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_0_0_EXTRACT]], ptr [[DOTFCA_0_0_0_0_GEP]], align 4 @@ -2178,18 +2144,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP236:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP236]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT]], i32 [[DOTFCA_1_3_LOAD]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 
[[TMP22]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP23]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP24]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP25]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP22]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT1]], i32 [[TMP23]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP24]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP25]], 3 ; POSTPROCESS-NEXT: [[TMP38:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP38]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP38]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 32: ; POSTPROCESS-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -2254,18 +2214,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP111:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD112:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP111]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT113:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT110]], i32 [[DOTFCA_1_3_LOAD112]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT61:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP41]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT64:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT61]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT67:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT64]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT70:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT67]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT73:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT70]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT76:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT73]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT79:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT76]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT82:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT79]], i32 [[TMP35]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT85:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT82]], i32 [[TMP36]], 8 -; POSTPROCESS-NEXT: 
[[DOTFCA_9_INSERT88:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT85]], i32 [[TMP37]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT62:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP41]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT65:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT62]], i32 [[TMP35]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT68:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT65]], i32 [[TMP36]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT71:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT68]], i32 [[TMP37]], 3 ; POSTPROCESS-NEXT: [[TMP52:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP52]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT113]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT88]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP52]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT113]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT71]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 44: ; POSTPROCESS-NEXT: br i1 [[TMP18]], label [[TMP53:%.*]], label [[TMP71:%.*]] @@ -2330,18 +2284,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP152:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD153:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP152]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT154:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT151]], i32 [[DOTFCA_1_3_LOAD153]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT91:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP48]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT94:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT91]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT97:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT94]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; 
POSTPROCESS-NEXT: [[DOTFCA_3_INSERT100:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT97]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT103:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT100]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT106:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT103]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT109:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT106]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT112:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT109]], i32 [[TMP49]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT115:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT112]], i32 [[TMP50]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT118:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT115]], i32 [[TMP51]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT74:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP48]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT77:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT74]], i32 [[TMP49]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT80:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT77]], i32 [[TMP50]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT83:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT80]], i32 [[TMP51]], 3 ; POSTPROCESS-NEXT: [[TMP55:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP55]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT154]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT118]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP55]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT154]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT83]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 56: ; POSTPROCESS-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -2402,18 +2350,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP193:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD194:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP193]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT195:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT192]], i32 [[DOTFCA_1_3_LOAD194]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT121:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP64]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT124:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT121]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT127:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT124]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT130:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT127]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT133:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT130]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT136:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT133]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT139:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT136]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT142:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT139]], i32 [[TMP59]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT145:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT142]], i32 [[TMP60]], 8 -; 
POSTPROCESS-NEXT: [[DOTFCA_9_INSERT148:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT145]], i32 [[TMP61]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT86:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP64]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT89:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT86]], i32 [[TMP59]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT92:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT89]], i32 [[TMP60]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT95:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT92]], i32 [[TMP61]], 3 ; POSTPROCESS-NEXT: [[TMP65:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP65]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT195]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT148]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP65]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT195]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT95]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 66: ; POSTPROCESS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) @@ -2477,23 +2419,17 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_GEP234:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD235:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP234]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT236:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT233]], i32 [[DOTFCA_1_3_LOAD235]], 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT151:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP72]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT154:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT151]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT157:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT154]], i32 
[[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT160:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT157]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT163:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT160]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT166:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT163]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT169:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT166]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT172:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT169]], i32 [[TMP73]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT175:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT172]], i32 [[TMP69]], 8 -; POSTPROCESS-NEXT: [[DOTFCA_9_INSERT178:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT175]], i32 [[TMP70]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT98:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP72]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT101:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT98]], i32 [[TMP73]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT104:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT101]], i32 [[TMP69]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT107:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT104]], i32 [[TMP70]], 3 ; POSTPROCESS-NEXT: [[TMP80:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP80]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT236]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT178]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP80]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT236]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT107]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define void @MyIntersectionShader( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] !continuation.stacksize [[META42:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] !continuation.stacksize [[META32:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2608,7 +2544,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: isEnd.i: ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -2674,7 +2610,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT351]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT351]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 18: ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -2724,18 +2660,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META40]] !continuation [[META41]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META41]] !continuation [[META42]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP15]], -8 -; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP1]], 2 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 1 ; POSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 2 @@ -2766,7 +2702,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 27 ; POSTPROCESS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 28 ; POSTPROCESS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 29 -; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { 
[[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT16:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 0, 0, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT18:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 1, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_1_1_EXTRACT20:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 1, 1 @@ -2834,7 +2770,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 12: ; POSTPROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) @@ -2887,12 +2823,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define void @MyIntersectionShader2( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META42]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META32]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3007,7 +2943,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader2.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: isEnd.i: ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -3073,7 +3009,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT351]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT351]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 18: ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -3123,18 +3059,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META40]] !continuation [[META43]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META41]] !continuation [[META43]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP15]], -8 -; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP1]], 2 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 1 ; POSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 2 @@ -3165,7 +3101,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 27 ; POSTPROCESS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 28 ; POSTPROCESS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP16]], 29 -; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP17:%.*]] = extractvalue { 
[[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT16:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 0, 0, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT18:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 1, 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_1_1_EXTRACT20:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], 0, 1, 1 @@ -3233,7 +3169,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 12: ; POSTPROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) @@ -3286,25 +3222,19 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define void @MyMissShader( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META44:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] 
[[PAYLOAD]], 1 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-NEXT: [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0 ; POSTPROCESS-NEXT: [[TMP1:%.*]] = bitcast i32 [[PAYLOAD_FCA_0_EXTRACT]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 @@ -3324,18 +3254,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> , i32 3 ; POSTPROCESS-NEXT: [[TMP12:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP5]], 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT1]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP6]], 7 -; POSTPROCESS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP7]], 8 -; POSTPROCESS-NEXT: 
[[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP12]], 9 +; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP5]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT1]], i32 [[TMP6]], 1 +; POSTPROCESS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP7]], 2 +; POSTPROCESS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP12]], 3 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP13]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 [[TMP13]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-NEXT: unreachable ; ; @@ -3419,7 +3343,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyRayGen( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -3441,36 +3365,24 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; 
CLEANUP-CPS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; CLEANUP-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 ; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP7]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP8]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 8, {} poison, i64 [[TMP6]], i32 5, [36 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP7]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP8]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP9]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP10]], 3 +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 8, {} poison, i64 [[TMP6]], i32 5, [30 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META35]] !continuation [[META36]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { ; CLEANUP-CPS-NEXT: entryresume.0: -; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, align 8 -; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4 -; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 1 -; CLEANUP-CPS-NEXT: 
[[DOTFCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 9 +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, align 8 +; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], ptr [[TMP4]], align 4 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 3 ; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = bitcast i32 [[DOTFCA_0_EXTRACT]] to float ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0 ; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = bitcast i32 [[DOTFCA_7_EXTRACT]] to float @@ -3479,11 +3391,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP8]], i32 2 ; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = bitcast i32 [[DOTFCA_9_EXTRACT]] to float ; CLEANUP-CPS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> 
[[DOTSROA_0_8_VEC_INSERT]], float [[TMP9]], i32 3 -; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP10]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[RES_1_I1:%.*]] = load i32, ptr [[TMP12]], align 4 ; CLEANUP-CPS-NEXT: [[RESPTR_2_I2:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 1 ; CLEANUP-CPS-NEXT: [[RES_2_I3:%.*]] = load i32, ptr [[RESPTR_2_I2]], align 4 @@ -3493,7 +3405,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[VAL_1_I7:%.*]] = insertelement <3 x i32> [[VAL_0_I6]], i32 [[RES_2_I3]], i32 1 ; CLEANUP-CPS-NEXT: [[VAL_2_I8:%.*]] = insertelement <3 x i32> [[VAL_1_I7]], i32 [[RES_3_I5]], i32 2 ; CLEANUP-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[VAL_2_I8]], i8 0 -; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[RES_1_I:%.*]] = load i32, ptr [[TMP13]], align 4 ; CLEANUP-CPS-NEXT: [[RESPTR_2_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr 
[[TMP13]], i32 0, i32 0, i32 1 ; CLEANUP-CPS-NEXT: [[RES_2_I:%.*]] = load i32, ptr [[RESPTR_2_I]], align 4 @@ -3514,19 +3426,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyClosestHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37:![0-9]+]] !lgc.cps [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; CLEANUP-CPS-NEXT: 
[[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; CLEANUP-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_GEP]], align 4 @@ -3569,34 +3475,22 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT10:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP17]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 
[[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP18]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP19]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP20]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP17]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP18]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP19]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP20]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyAnyHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META34]] !continuation [[META41:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] 
[[PAYLOAD]], 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 0, 0 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 ; CLEANUP-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_0_0_GEP]], align 4 @@ -3789,17 +3683,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT]], i32 [[DOTFCA_1_3_LOAD]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP19]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: 
[[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP19]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP20]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP21]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP22]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 28: ; CLEANUP-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -3864,17 +3752,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP262:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD263:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP262]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT264:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT261]], i32 [[DOTFCA_1_3_LOAD263]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT62:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP30]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT65:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT62]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT68:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT65]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT71:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT68]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT74:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT71]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT77:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT74]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT80:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT77]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT83:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT80]], i32 [[TMP31]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT86:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT83]], i32 
[[TMP32]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT89:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT86]], i32 [[TMP33]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT264]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT89]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT62:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP30]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT65:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT62]], i32 [[TMP31]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT68:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT65]], i32 [[TMP32]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT71:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT68]], i32 [[TMP33]], 3 +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT264]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT71]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 39: ; CLEANUP-CPS-NEXT: br i1 [[TMP15]], label [[TMP40:%.*]], label [[TMP59:%.*]] @@ -3939,17 +3821,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP303:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD304:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP303]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT305:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT302]], i32 [[DOTFCA_1_3_LOAD304]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT92:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP43]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT95:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT92]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT98:%.*]] = insertvalue 
[10 x i32] [[DOTFCA_1_INSERT95]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT101:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT98]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT104:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT101]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT107:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT104]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT110:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT107]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT113:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT110]], i32 [[TMP44]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT116:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT113]], i32 [[TMP45]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT119:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT116]], i32 [[TMP46]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT305]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT119]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT74:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP43]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT77:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT74]], i32 [[TMP44]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT80:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT77]], i32 [[TMP45]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT83:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT80]], i32 [[TMP46]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT305]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT83]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 50: ; CLEANUP-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -4010,17 +3886,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP344:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD345:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP344]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT346:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT343]], i32 [[DOTFCA_1_3_LOAD345]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT122:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP52]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT125:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT122]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT128:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT125]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT131:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT128]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT134:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT131]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT137:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT134]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT140:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT137]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT143:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT140]], i32 [[TMP53]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT146:%.*]] = insertvalue [10 x i32] 
[[DOTFCA_7_INSERT143]], i32 [[TMP54]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT149:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT146]], i32 [[TMP55]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT346]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT149]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT86:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP52]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT89:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT86]], i32 [[TMP53]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT92:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT89]], i32 [[TMP54]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT95:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT92]], i32 [[TMP55]], 3 +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT346]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT95]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 59: ; CLEANUP-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) @@ -4084,22 +3954,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_GEP385:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_LOAD386:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP385]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_3_INSERT387:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT384]], i32 [[DOTFCA_1_3_LOAD386]], 1, 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT152:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP60]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT155:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT152]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: 
[[DOTFCA_2_INSERT158:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT155]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT161:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT158]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT164:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT161]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT167:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT164]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT170:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT167]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT173:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT170]], i32 [[TMP61]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT176:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT173]], i32 [[TMP62]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT179:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT176]], i32 [[TMP63]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT387]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT179]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT98:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP60]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT101:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT98]], i32 [[TMP61]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT104:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT101]], i32 [[TMP62]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT107:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT104]], i32 [[TMP63]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT387]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT107]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4208,7 +4072,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader.resume.0) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: isEnd.i: ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -4271,7 +4135,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 9: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -4318,15 +4182,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META43]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META43]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 1 ; CLEANUP-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 2 @@ -4357,7 +4221,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 27 ; CLEANUP-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 29 -; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 0 ; 
CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 0, 0, 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 1, 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 1, 1 @@ -4421,7 +4285,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 8: ; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -4470,12 +4334,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader2( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER2_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4584,7 +4448,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader2.resume.0) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: isEnd.i: ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -4647,7 +4511,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 9: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -4694,15 +4558,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 1 ; CLEANUP-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 2 @@ -4733,7 +4597,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 27 ; CLEANUP-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 29 -; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 0 ; 
CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 0, 0, 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 1, 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP6]], 0, 1, 1 @@ -4797,7 +4661,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 8: ; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER2_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -4846,23 +4710,17 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyMissShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META38]] !continuation [[META45:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !lgc.cps [[META39]] !continuation [[META45:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; CLEANUP-CPS-NEXT: 
[[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = bitcast i32 [[PAYLOAD_FCA_0_EXTRACT]] to float ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 @@ -4882,17 +4740,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> , i32 3 ; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP4]], 0 -; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; CLEANUP-CPS-NEXT: 
[[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP5]], 7 -; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP6]], 8 -; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP7]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP4]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP5]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP6]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP7]], 3 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -4976,7 +4828,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-CPS-LABEL: define void @MyRayGen( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5000,39 +4852,27 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP12]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 -; POSTPROCESS-CPS-NEXT: 
[[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP11]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP12]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP11]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP9]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP10]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 4, i32 [[TMP13]], i64 [[TMP8]], i32 5, [36 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 4, i32 [[TMP13]], i64 [[TMP8]], i32 5, [30 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META35]] !continuation [[META36]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: -; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, align 8 +; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 6 -; 
POSTPROCESS-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 9 +; POSTPROCESS-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], ptr [[TMP4]], align 4 +; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = bitcast i32 [[DOTFCA_0_EXTRACT]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = bitcast i32 [[DOTFCA_7_EXTRACT]] to float @@ -5041,11 +4881,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP8]], i32 2 ; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = bitcast i32 [[DOTFCA_9_EXTRACT]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP9]], i32 3 -; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP10]], 0 ; POSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 
0) ; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[RES_1_I1:%.*]] = load i32, ptr [[TMP12]], align 4 ; POSTPROCESS-CPS-NEXT: [[RESPTR_2_I2:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 1 ; POSTPROCESS-CPS-NEXT: [[RES_2_I3:%.*]] = load i32, ptr [[RESPTR_2_I2]], align 4 @@ -5055,7 +4895,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[VAL_1_I7:%.*]] = insertelement <3 x i32> [[VAL_0_I6]], i32 [[RES_2_I3]], i32 1 ; POSTPROCESS-CPS-NEXT: [[VAL_2_I8:%.*]] = insertelement <3 x i32> [[VAL_1_I7]], i32 [[RES_3_I5]], i32 2 ; POSTPROCESS-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[VAL_2_I8]], i8 0 -; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[RES_1_I:%.*]] = load i32, ptr [[TMP13]], align 4 ; POSTPROCESS-CPS-NEXT: [[RESPTR_2_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP13]], i32 0, i32 0, i32 1 ; POSTPROCESS-CPS-NEXT: [[RES_2_I:%.*]] = load i32, ptr [[RESPTR_2_I]], align 4 @@ -5076,21 +4916,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-CPS-LABEL: define void @MyClosestHitShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], 
[[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37:![0-9]+]] !lgc.cps [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 
+; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_GEP]], align 4 @@ -5133,38 +4967,26 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT10:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP17]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] 
[[DOTFCA_6_INSERT]], i32 [[TMP18]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP19]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP20]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP17]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP18]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP19]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP20]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP24:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP24]], i32 [[TMP25]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP24]], i32 [[TMP25]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyAnyHitShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META34:![0-9]+]] !continuation [[META41:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] 
[[PAYLOAD]], 6 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 0, 0 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_0_0_GEP]], align 4 @@ -5357,31 +5179,25 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT]], i32 [[DOTFCA_1_3_LOAD]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP19]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: 
[[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP19]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP20]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP21]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP22]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP30:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP30]], i32 [[TMP31]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP30]], i32 [[TMP29]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 30: ; POSTPROCESS-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP33]]) ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT25:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP34:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT25]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT25]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT34:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP35:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT34]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP37:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT34]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT42:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT42]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP34:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT42]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT52:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 -; POSTPROCESS-CPS-NEXT: [[TMP37:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT52]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP35:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT52]] to i32 ; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP38:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP38]] to float @@ 
-5434,19 +5250,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP261:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD262:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP261]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT263:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT260]], i32 [[DOTFCA_1_3_LOAD262]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT61:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP34]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT64:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT61]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT67:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT64]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT70:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT67]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT73:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT70]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT76:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT73]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT79:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT76]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT82:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT79]], i32 [[TMP35]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT85:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT82]], i32 [[TMP36]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT88:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT85]], i32 [[TMP37]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT62:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP36]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT65:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT62]], i32 [[TMP37]], 1 +; POSTPROCESS-CPS-NEXT: 
[[DOTFCA_2_INSERT68:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT65]], i32 [[TMP34]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT71:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT68]], i32 [[TMP35]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP45:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP45]], i32 [[TMP46]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT263]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT88]]) +; POSTPROCESS-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP45]], i32 [[TMP43]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT263]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT71]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 43: ; POSTPROCESS-CPS-NEXT: br i1 [[TMP15]], label [[TMP48:%.*]], label [[TMP75:%.*]] @@ -5511,31 +5321,25 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP302:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD303:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP302]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT304:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT301]], i32 [[DOTFCA_1_3_LOAD303]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT91:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP51]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT94:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT91]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT97:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT94]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT100:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT97]], i32 
[[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT103:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT100]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT106:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT103]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT109:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT106]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT112:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT109]], i32 [[TMP52]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT115:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT112]], i32 [[TMP53]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT118:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT115]], i32 [[TMP54]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT74:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP51]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT77:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT74]], i32 [[TMP52]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT80:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT77]], i32 [[TMP53]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT83:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT80]], i32 [[TMP54]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP60:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP60]], i32 [[TMP61]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT304]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT118]]) +; POSTPROCESS-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP60]], i32 [[TMP64]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT304]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT83]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 56: ; POSTPROCESS-CPS-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; POSTPROCESS-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP63]]) ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT29:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP64:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT29]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP58:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT29]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT38:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP65:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT38]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP59:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT38]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT46:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP66:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT46]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP65:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT46]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT56:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 -; POSTPROCESS-CPS-NEXT: [[TMP67:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT56]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP61:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT56]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP68:%.*]] = bitcast i32 [[TMP6]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0404_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP68]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP69:%.*]] = bitcast i32 [[TMP7]] to float @@ -5584,30 +5388,24 @@ attributes #5 = { nocallback nofree 
nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP343:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD344:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP343]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT345:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT342]], i32 [[DOTFCA_1_3_LOAD344]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT121:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP64]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT124:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT121]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT127:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT124]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT130:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT127]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT133:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT130]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT136:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT133]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT139:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT136]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT142:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT139]], i32 [[TMP65]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT145:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT142]], i32 [[TMP66]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT148:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT145]], i32 [[TMP67]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT86:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP58]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT89:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT86]], i32 [[TMP59]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT92:%.*]] = insertvalue [4 x 
i32] [[DOTFCA_1_INSERT89]], i32 [[TMP65]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT95:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT92]], i32 [[TMP61]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP73:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP74:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP73]], i32 [[TMP74]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT345]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT148]]) +; POSTPROCESS-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP73]], i32 [[TMP66]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT345]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT95]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 67: ; POSTPROCESS-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT31:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP76:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT31]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP72:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT31]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT40:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP77:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT40]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP74:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT40]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT48:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP78:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT48]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP76:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT48]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT58:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 
3 -; POSTPROCESS-CPS-NEXT: [[TMP79:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT58]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP71:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT58]] to i32 ; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP80:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP81:%.*]] = bitcast i32 [[TMP80]] to float @@ -5660,24 +5458,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_GEP384:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_LOAD385:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP384]], align 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_3_INSERT386:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT383]], i32 [[DOTFCA_1_3_LOAD385]], 1, 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT151:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP76]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT154:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT151]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT157:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT154]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT160:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT157]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT163:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT160]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT166:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT163]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT169:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT166]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT172:%.*]] = insertvalue [10 x i32] 
[[DOTFCA_6_INSERT169]], i32 [[TMP77]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT175:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT172]], i32 [[TMP78]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT178:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT175]], i32 [[TMP79]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT98:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP72]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT101:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT98]], i32 [[TMP74]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT104:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT101]], i32 [[TMP76]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT107:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT104]], i32 [[TMP71]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP87:%.*]] = zext i32 [[RETURNADDR]] to i64 -; POSTPROCESS-CPS-NEXT: [[TMP88:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP87]], i32 [[TMP88]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT386]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT178]]) +; POSTPROCESS-CPS-NEXT: [[TMP78:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP87]], i32 [[TMP78]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT386]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT107]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyIntersectionShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5792,7 +5584,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: isEnd.i: ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -5859,7 +5651,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP15]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP16]], i32 [[TMP17]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP16]], i32 [[TMP17]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 18: ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -5910,18 +5702,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP21]], i32 [[TMP22]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP21]], i32 [[TMP22]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META43]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META43]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -8 -; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 +; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 1 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 2 @@ -5952,7 +5744,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29 -; 
POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 0, 0, 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 1, 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 1, 1 @@ -6021,7 +5813,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP12]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = zext i32 [[RETURN_ADDR_RELOAD2]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP13]], i32 [[TMP14]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP13]], i32 [[TMP14]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 15: ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(21) @@ -6075,12 +5867,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP19]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP20:%.*]] = zext i32 [[RETURN_ADDR_RELOAD]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP20]], i32 [[TMP21]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP20]], i32 [[TMP21]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyIntersectionShader2( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -6195,7 +5987,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader2.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: isEnd.i: ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -6262,7 +6054,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP15]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP16]], i32 [[TMP17]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP16]], i32 [[TMP17]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 18: ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -6313,18 +6105,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP21]], i32 [[TMP22]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP21]], i32 [[TMP22]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = 
load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -8 -; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 +; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 1 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 2 @@ -6355,7 +6147,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29 -; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 0, 0, 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 1, 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP7]], 0, 1, 1 @@ -6424,7 +6216,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP12]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = zext i32 [[RETURN_ADDR_RELOAD2]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP13]], i32 [[TMP14]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) +; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP13]], i32 [[TMP14]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]) ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: 15: ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(21) @@ -6478,25 +6270,19 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP19]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP20:%.*]] = zext i32 [[RETURN_ADDR_RELOAD]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP20]], i32 [[TMP21]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP20]], i32 [[TMP21]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyMissShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META38]] !continuation [[META45:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !lgc.cps [[META39]] !continuation [[META45:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 3 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 4 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 5 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 6 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 7 -; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8 -; 
POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 +; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; POSTPROCESS-CPS-NEXT: [[TMP0:%.*]] = bitcast i32 [[PAYLOAD_FCA_0_EXTRACT]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 @@ -6516,18 +6302,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> , i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP4]], 0 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 -; POSTPROCESS-CPS-NEXT: 
[[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP5]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP6]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP7]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[TMP4]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP5]], 1 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP6]], 2 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP7]], 3 ; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP10]], i32 [[TMP11]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; POSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP10]], i32 [[TMP11]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]) ; POSTPROCESS-CPS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/payload-save-registers.ll b/llvmraytracing/test/dx/payload-save-registers.ll index 21a0ee7046..1107306d2a 100644 --- a/llvmraytracing/test/dx/payload-save-registers.ll +++ b/llvmraytracing/test/dx/payload-save-registers.ll @@ -31,209 +31,231 @@ declare !pointeetys !48 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, ; Function Attrs: nounwind define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !pointeetys !23 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @Miss( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META25:![0-9]+]] !continuation.registercount [[META23:![0-9]+]] !continuation [[META26:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [4 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META26:![0-9]+]] !continuation.registercount [[META24:![0-9]+]] !continuation [[META27:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [37 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_OUTERPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] 
= getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = load ptr addrspace(32), ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
store i32 [[TMP11]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP12]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP27]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds i32, ptr [[TMP4]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP36]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP20]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP31]], ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP66]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds i32, ptr [[TMP6]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP70]], ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP37]], ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP70]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[TMP26]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP72]], ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] 
= load i32, ptr [[TMP29]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP30]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP72]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP30]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP74]], ptr [[TMP32]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr [[TMP32]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP76]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP75]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP74]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP76]], ptr [[TMP34]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP78]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP81:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP81]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP83]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP90]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP56]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP107]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP60]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP82]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP82]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP85]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP85]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP55]], align 4 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP92:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP92]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP92]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP141:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP141]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP141]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = load i32, ptr [[TMP40]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP40]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP63]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP65]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = alloca [[STRUCT_INNERPAYLOAD:%.*]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = bitcast ptr [[TMP46]] to ptr ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP47]]) #[[ATTR0]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 
0, i32 0, i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA27:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA28:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP71]], ptr [[TMP50]], align 4, !tbaa [[TBAA27]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP73]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP116]], ptr [[TMP50]], align 4, !tbaa [[TBAA28]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP84]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP52]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] 
[[DIS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP57]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP78]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = load i32, ptr [[TMP57]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP119]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP79]]), !continuation.registercount [[META31:![0-9]+]], !continuation.returnedRegistercount [[META31]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } @await(ptr [[TMP80]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP81]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [4 x i32] poison, [1 x i32] [[TMP79]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } @await(ptr [[TMP122]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } [[TMP125]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP61]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_INNERPAYLOAD]] 
poison, ptr [[TMP46]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP84]], ptr [[TMP59]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP81]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP128:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP128]], ptr [[TMP59]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } [[TMP125]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP58]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP50]], align 4, !tbaa [[TBAA27]] -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP86]], ptr [[TMP48]], align 4, !tbaa [[TBAA27]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = load float, ptr [[TMP50]], align 4, !tbaa [[TBAA28]] +; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP97]], ptr [[TMP48]], align 4, !tbaa [[TBAA28]] ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP47]]) #[[ATTR0]] -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP77]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP80]], ptr [[TMP98]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], 
ptr [[TMP99]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP86]], ptr [[TMP131]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP96]], ptr [[TMP101]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP105]], ptr [[TMP102]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP108]], ptr [[TMP134]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP110]], ptr [[TMP104]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP83]], ptr [[TMP87]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP111]], ptr [[TMP87]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP90]], ptr [[TMP88]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[TMP88]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP56]], ptr [[TMP89]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP89]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP96]], ptr [[TMP39]], align 4 +; 
LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP39]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP60]], ptr [[TMP91]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP71]], ptr [[TMP91]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP142]], ptr [[TMP42]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP73]], ptr [[TMP42]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP93:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP93]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP75]], ptr [[TMP93]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP66]], ptr [[TMP49]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = load ptr addrspace(32), ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP77]], ptr [[TMP49]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP62]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP98]], ptr [[TMP97]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = load i32, ptr [[TMP62]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP114]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = 
getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP101]], ptr [[TMP99]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = load i32, ptr [[TMP100]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP117]], ptr [[TMP137]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP103]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP104]], ptr [[TMP102]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP103]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP120]], ptr [[TMP140]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP146:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP106:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP106]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP107]], ptr [[TMP105]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP106]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP123]], ptr [[TMP146]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP149:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], 
i32 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP109:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP109]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP110]], ptr [[TMP108]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP109]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP126]], ptr [[TMP149]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP152:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP112]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[TMP111]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 6 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = load i32, ptr [[TMP112]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP129]], ptr [[TMP152]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP155:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP116]], ptr [[TMP114]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 7 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = load i32, ptr [[TMP115]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP132]], ptr [[TMP155]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP157:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = 
getelementptr inbounds i32, ptr [[TMP62]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = load i32, ptr [[TMP118]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP119]], ptr [[TMP117]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = load i32, ptr [[TMP118]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP135]], ptr [[TMP157]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP158:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP121]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP122]], ptr [[TMP120]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 9 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = load i32, ptr [[TMP121]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP138]], ptr [[TMP158]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP159:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP124]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP125]], ptr [[TMP123]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 10 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP124]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP160]], ptr [[TMP159]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP127:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 10 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP128:%.*]] = load i32, ptr [[TMP127]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP128]], ptr [[TMP126]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 11 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = load i32, ptr [[TMP127]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP144]], ptr [[TMP142]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP131]], ptr [[TMP129]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 12 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP147:%.*]] = load i32, ptr [[TMP130]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP147]], ptr [[TMP145]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP148:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = load i32, ptr [[TMP133]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP134]], ptr [[TMP132]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 13 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP150:%.*]] = load i32, ptr [[TMP133]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP150]], ptr [[TMP148]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP151:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP136:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = load 
i32, ptr [[TMP136]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP137]], ptr [[TMP135]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP97]], i32 14 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP153:%.*]] = load i32, ptr [[TMP136]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP153]], ptr [[TMP151]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP154:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP139:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = load i32, ptr [[TMP139]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP140]], ptr [[TMP138]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP156:%.*]] = load i32, ptr [[TMP139]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP156]], ptr [[TMP154]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP94]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [10 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [4 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META24]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 @@ -256,7 +278,7 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !poi ; Function Attrs: nounwind define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeetys !23 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @Callable( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [10 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation.registercount [[META23]] !continuation [[META33:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [4 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META33:![0-9]+]] !continuation.registercount [[META24]] !continuation [[META34:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_OUTERPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 @@ -624,9 +646,9 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP488:%.*]] = load i32, ptr [[TMP270]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP488]], ptr [[TMP269]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP272:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP489:%.*]] = call ptr inttoptr (i64 2 to 
ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [11 x i32] poison, [30 x i32] [[TMP272]]), !continuation.registercount [[META23]], !continuation.returnedRegistercount [[META23]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP274:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [30 x i32] } @await.1(ptr [[TMP489]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP490:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [30 x i32] } [[TMP274]], 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP489:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [4 x i32] poison, [30 x i32] [[TMP272]]), !continuation.registercount [[META24]], !continuation.returnedRegistercount [[META24]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP274:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } @await.1(ptr [[TMP489]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP490:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } [[TMP274]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP490]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_OUTERPAYLOAD]] poison, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP224:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0 @@ -748,7 +770,7 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP363:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP364:%.*]] = load i32, ptr [[TMP363]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP364]], ptr [[TMP275]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP223:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [30 x i32] } [[TMP274]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP223:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } [[TMP274]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: 
store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP223]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] @@ -934,7 +956,7 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP484]], ptr [[TMP482]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP382:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP486:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [10 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [4 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META24]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = alloca %struct.OuterPayload, align 8 @@ -1138,10 +1160,7 @@ declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.type declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #3 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #3 - -; Function Attrs: alwaysinline -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #3 +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) #3 ; Function Attrs: alwaysinline declare !pointeetys !32 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #3 @@ -1184,7 +1203,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i ; 
Function Attrs: alwaysinline define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #3 !pointeetys !45 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -1210,6 +1229,7 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !dx.typeAnnotations = !{!7} !dx.dxrPayloadAnnotations = !{!12} !dx.entryPoints = !{!17, !18, !21} +!lgc.rt.max.attribute.size = !{!49} !0 = !{!"dxcoob 2019.05.00"} !1 = !{i32 1, i32 7} @@ -1260,3 +1280,4 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !46 = !{i8 poison} !47 = !{i32 0, i8 poison} !48 = !{%struct.AnyHitTraversalData poison} +!49 = !{i32 8} diff --git a/llvmraytracing/test/dx/remat-intrinsic.ll b/llvmraytracing/test/dx/remat-intrinsic.ll index e51fe74b9a..5a538b179f 100644 --- a/llvmraytracing/test/dx/remat-intrinsic.ll +++ b/llvmraytracing/test/dx/remat-intrinsic.ll @@ -23,7 +23,7 @@ declare i32 @_cont_GetContinuationStackAddr() declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) declare !pointeetys !14 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) @@ -39,7 +39,7 @@ define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeet define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !pointeetys !20 { %dis_data = load 
%struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -93,6 +93,7 @@ attributes #1 = { nounwind } !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6} !continuation.maxPayloadRegisterCount = !{!13} +!lgc.rt.max.attribute.size = !{!26} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -120,13 +121,14 @@ attributes #1 = { nounwind } !23 = !{%struct.MyParams poison} !24 = !{i32 0, %struct.TraversalData poison} !25 = !{%struct.TraversalData poison} +!26 = !{i32 8} ; POSTPROCESS-LABEL: define i32 @_cont_GetLocalRootIndex( ; POSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR1:[0-9]+]] { ; POSTPROCESS-NEXT: ret i32 5 ; ; ; POSTPROCESS-LABEL: define void @called( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META16:![0-9]+]] !lgc.rt.shaderstage [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META17:![0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation.stacksize [[META14:![0-9]+]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -151,23 +153,23 @@ attributes #1 = { nounwind } ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 
[[PAYLOAD_FCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 2, i32 [[TMP7]], i64 [[TMP8]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]) ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP1:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [2 x i32], [1 x i32] } [[TMP1:%.*]]) !continuation [[META17]] !lgc.rt.shaderstage [[META18]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[TMP16:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], ptr [[TMP16]], align 4 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP13]], -8 -; POSTPROCESS-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 2 +; POSTPROCESS-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2 
; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP4]], 0 -; POSTPROCESS-NEXT: [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP1]], 0 +; POSTPROCESS-NEXT: [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP15]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) @@ -200,6 +202,6 @@ attributes #1 = { nounwind } ; POSTPROCESS-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP12]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [8 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]) +; POSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP12]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]) ; POSTPROCESS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/traceray.ll b/llvmraytracing/test/dx/traceray.ll index eac4b2f7d3..b6b73db5bb 100644 --- a/llvmraytracing/test/dx/traceray.ll +++ b/llvmraytracing/test/dx/traceray.ll @@ -34,7 +34,7 @@ declare %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i64, i64, %struct.Tra declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 -declare %struct.TraversalData @_AmdAwaitAnyHit(i64, %struct.TraversalData, float, i32) #0 +declare %struct.TraversalData @_AmdAwaitAnyHit(i64, i64, %struct.TraversalData) #0 declare void @lgc.cps.jump(...) 
#0 @@ -122,7 +122,7 @@ define i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) anyhit: ; preds = %0 %trav_data = load %struct.TraversalData, %struct.TraversalData* %data, align 4 - %newdata = call %struct.TraversalData @_AmdAwaitAnyHit(i64 3, %struct.TraversalData %trav_data, float %t, i32 %hitKind) + %newdata = call %struct.TraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.TraversalData %trav_data) store %struct.TraversalData %newdata, %struct.TraversalData* %data, align 4 call void @_AmdRestoreSystemDataAnyHit(%struct.TraversalData* %data) ret i1 true @@ -319,6 +319,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !dx.entryPoints = !{!18, !20, !23, !25, !27, !29, !31} !lgc.cps.module = !{} !continuation.stackAddrspace = !{!70} ; SKIP_GLOBAL_ADDRSPACE +!lgc.rt.max.attribute.size = !{!71} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -391,6 +392,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !68 = !{i32 4} !69 = !{i32 7} !70 = !{i32 22} +!71 = !{i32 32} ; Intentionally allow more than the max used (7) so we can test that the actually used size is used. ; LOWERRAYTRACINGPIPELINE-LABEL: define i1 @_cont_IsEndSearch( ; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { @@ -418,7 +420,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @_cont_KernelEntry( -; LOWERRAYTRACINGPIPELINE-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) ; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison), !continuation.registercount [[META22]] @@ -440,7 +442,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META36:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META37:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -451,7 +453,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA38:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -476,7 +478,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP41]]), !continuation.registercount [[META33:![0-9]+]], !waitmask [[META40:![0-9]+]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP41]]), !continuation.registercount [[META34:![0-9]+]], !waitmask [[META41:![0-9]+]], !continuation.returnedRegistercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } @await(ptr [[TMP42]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP43]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[TMP24]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -501,7 +503,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA38]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() @@ -519,7 +521,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META42:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META43:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 @@ -583,12 +585,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP44]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], 
align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], [21 x i32] poison, [10 x i32] [[TMP47]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], [21 x i32] poison, [10 x i32] [[TMP47]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -685,7 +687,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP58]], ptr [[ADDR_I1]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 
-; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP60]], [8 x i32] poison, [10 x i32] [[TMP68]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP60]], [8 x i32] poison, [10 x i32] [[TMP68]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 59: ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP25]], ptr [[TMP24]], align 4 @@ -717,12 +719,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP79]], ptr [[ADDR_I2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP81]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP81]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META33:![0-9]+]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -745,7 +747,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], [20 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], [6 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } @await.1(ptr [[TMP13]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP19]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP25]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -773,18 +775,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 22: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 25: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShaderLargeAttrs( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !continuation.registercount [[META32]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46]] !continuation.registercount [[META33]] !continuation [[META48:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4 @@ -821,7 +823,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_LARGEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I1]], i32 0, 
[[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP8]], [15 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP8]], [1 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } @await.2(ptr [[TMP13]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP34]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP35]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -868,18 +870,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 36: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP31]], [8 x i32] poison, [30 x i32] [[TMP38]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP31]], [8 x i32] poison, [30 x i32] [[TMP38]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 39: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP33]], [8 x i32] poison, [30 x i32] [[TMP41]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP33]], [8 x i32] poison, [30 x i32] [[TMP41]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyMissShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META48:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META49:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META49:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META50:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], 
align 8 @@ -922,7 +924,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP27]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], [21 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], [21 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -952,7 +954,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -976,7 +978,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation 
[[META37:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META38:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1016,7 +1018,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META37]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META38]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -1063,7 +1065,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: 
[[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1131,7 +1133,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyAnyHitShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -1365,7 +1367,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META44:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META45:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; 
DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1462,7 +1464,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: accepthit.i: ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -1570,7 +1572,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META43]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META44]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; 
DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1718,7 +1720,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !continuation [[META45:![0-9]+]] !continuation.stacksize [[META44]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !continuation [[META46:![0-9]+]] !continuation.stacksize [[META45]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1821,7 +1823,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: accepthit.i: ; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = bitcast i32 100 to float @@ -1925,7 +1927,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META45]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META46]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2073,7 +2075,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 
@@ -2148,7 +2150,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) @@ -2174,7 +2176,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META37:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META38:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2216,7 +2218,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META37]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: 
i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META38]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -2265,7 +2267,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2335,7 +2337,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyAnyHitShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x 
i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -2571,7 +2573,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META44:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META45:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2669,7 +2671,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: accepthit.i: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -2777,7 +2779,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META43]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META44]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2925,7 +2927,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] 
!continuation [[META45:![0-9]+]] !continuation.stacksize [[META44]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !continuation [[META46:![0-9]+]] !continuation.stacksize [[META45]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3029,7 +3031,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: accepthit.i: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = bitcast i32 100 to float @@ -3133,7 +3135,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META45]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META46]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3281,7 +3283,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: 
[[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3358,7 +3360,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @_cont_KernelEntry( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison) @@ -3380,7 +3382,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -3391,7 +3393,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = 
bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA38:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA39:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -3416,7 +3418,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP21]]), !waitmask [[META41:![0-9]+]], !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP21]]), !waitmask [[META42:![0-9]+]], !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP22]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 @@ -3440,7 +3442,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: .split: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA38]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA39]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() @@ -3458,7 +3460,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void 
@MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 @@ -3521,12 +3523,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP40]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP41]], [21 x i32] poison, [10 x i32] [[TMP42]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP41]], [21 x i32] poison, [10 x i32] [[TMP42]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44]] !lgc.cps [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -3622,7 +3624,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP52]], ptr [[ADDR_I1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP54]], [8 x i32] poison, [10 x i32] [[TMP55]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP54]], [8 x i32] poison, [10 x i32] [[TMP55]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 56: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP26]], ptr [[TMP25]], align 4 @@ -3654,12 +3656,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP72]], ptr [[ADDR_I2]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP74]], [8 x i32] poison, [10 x i32] [[TMP75]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP74]], [8 x i32] poison, [10 x i32] [[TMP75]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META48:![0-9]+]] !continuation [[META49:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -3682,7 +3684,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [20 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [6 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -3709,18 +3711,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS: 20: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP22]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP22]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 23: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META47]] !continuation [[META49:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META48]] !continuation [[META50:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4 @@ -3757,7 +3759,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_LARGEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP7]], [15 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP7]], [1 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -3803,18 +3805,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS: 34: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP35]], [8 x i32] poison, [30 x i32] [[TMP36]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP35]], [8 x i32] poison, [30 x i32] [[TMP36]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 37: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP38]], [8 x i32] poison, [30 x i32] [[TMP39]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP38]], [8 x i32] poison, [30 x i32] [[TMP39]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyMissShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !lgc.cps [[META43]] !continuation [[META50:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46]] !lgc.cps [[META44]] !continuation [[META51:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 @@ -3856,7 +3858,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [21 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [21 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META34]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -3886,7 +3888,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @_cont_KernelEntry( -; CLEANUP-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] { +; CLEANUP-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] { ; CLEANUP-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; CLEANUP-CPS-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) ; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison) @@ -3908,7 +3910,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyRayGen( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -3940,12 +3942,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP8]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !waitmask [[META38:![0-9]+]], !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !waitmask [[META39:![0-9]+]], !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META37]] !continuation [[META38]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, align 8 ; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4 @@ -3989,7 +3991,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyClosestHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = 
extractvalue [10 x i32] [[PAYLOAD]], 0 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 @@ -4049,12 +4051,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP17]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP18]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP19]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyAnyHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = alloca 
[[STRUCT_HITDATA:%.*]], align 8 ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -4218,7 +4220,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT73]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT73]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 28: ; CLEANUP-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) @@ -4278,12 +4280,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT49:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT46]], i32 [[TMP30]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT52:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT49]], i32 [[TMP31]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT55:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT52]], i32 [[TMP32]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT99]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT55]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT99]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT55]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4374,7 +4376,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader.resume.0) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT327]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT327]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: accepthit.i: ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -4429,7 +4431,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 6: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_EXTRACT]], 0, 0, 0 @@ -4471,12 +4473,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META45]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META46]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 @@ -4564,7 +4566,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 8: ; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -4608,12 +4610,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META46:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADERLARGEATTRS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4710,7 +4712,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShaderLargeAttrs.resume.0) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: accepthit.i: ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = bitcast i32 100 to float @@ -4761,7 +4763,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 4: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_EXTRACT]], 0, 0, 0 @@ -4803,12 +4805,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META46]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META47]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 @@ -4896,7 +4898,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD6]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD6]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: 8: ; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADERLARGEATTRS_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -4940,12 +4942,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define void @MyMissShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META40]] !continuation [[META47:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META41]] !continuation [[META48:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 @@ -4987,7 +4989,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP5]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP6]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP7]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -5017,7 +5019,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-CPS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5041,7 +5043,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5081,7 +5083,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} 
[[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META37]] !continuation [[META38]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, align 8 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -5127,7 +5129,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5196,7 +5198,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyAnyHitShader( -; 
DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -5431,7 +5433,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; 
DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5528,7 +5530,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT326]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT326]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; DXILCONTPOSTPROCESS-CPS: accepthit.i: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -5638,7 +5640,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META44]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage 
[[META37]] !lgc.cps [[META44]] !continuation [[META45]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5788,7 +5790,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META45:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META46:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5891,7 +5893,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]) ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; DXILCONTPOSTPROCESS-CPS: accepthit.i: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = bitcast i32 100 to float @@ -5997,7 +5999,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META45]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META46]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -6147,7 +6149,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] 
!lgc.rt.shaderstage [[META41]] !lgc.cps [[META39]] !continuation [[META46:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META40]] !continuation [[META47:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 diff --git a/llvmraytracing/test/dx/traversal-empty-payload.ll b/llvmraytracing/test/dx/traversal-empty-payload.ll index 2e3a304308..86118f8d91 100644 --- a/llvmraytracing/test/dx/traversal-empty-payload.ll +++ b/llvmraytracing/test/dx/traversal-empty-payload.ll @@ -10,7 +10,7 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: %struct.SystemData = type { %struct.DispatchSystemData, float } %struct.DispatchSystemData = type { i32 } -!continuation.preservedPayloadRegisterCount = !{!8} ; EMPTY_PAYLOAD +!continuation.maxUsedPayloadRegisterCount = !{!8} ; EMPTY_PAYLOAD declare !pointeetys !4 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) diff --git a/llvmraytracing/test/dx/traversal-passthrough-payload.ll b/llvmraytracing/test/dx/traversal-passthrough-payload.ll index 6d75c1ba92..9224962e8c 100644 --- a/llvmraytracing/test/dx/traversal-passthrough-payload.ll +++ b/llvmraytracing/test/dx/traversal-passthrough-payload.ll @@ -10,7 +10,7 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: %struct.SystemData = type { %struct.DispatchSystemData, float } %struct.DispatchSystemData = type { i32 } -!continuation.preservedPayloadRegisterCount = !{!8} ; PRESERVED_REGCOUNT +!continuation.maxUsedPayloadRegisterCount = !{!8} ; PRESERVED_REGCOUNT declare !pointeetys !4 i32 
@_cont_GetLocalRootIndex(%struct.DispatchSystemData*) diff --git a/llvmraytracing/test/lgccps/entry-point-with-cps.ll b/llvmraytracing/test/lgccps/entry-point-with-cps.ll index 02f18af2e8..c974db97f7 100644 --- a/llvmraytracing/test/lgccps/entry-point-with-cps.ll +++ b/llvmraytracing/test/lgccps/entry-point-with-cps.ll @@ -8,6 +8,11 @@ declare void @lgc.cps.complete() +define void @_cont_KernelEntry() #0 !lgc.rt.shaderstage !{i32 7} { + call void @lgc.cps.complete() + unreachable +} + define spir_func void @raygen({} %state, i32 %rcr) !lgc.shaderstage !{i32 7} !lgc.cps !{i32 0} { %pushconst = call ptr addrspace(4) @lgc.user.data(i32 0) %fn = load ptr, ptr addrspace(4) %pushconst @@ -67,6 +72,9 @@ declare void @lgc.cps.await__isVoid(...) declare i32 @lgc.cps.await__i32(...) declare [2 x i32] @lgc.cps.await__a2i32(...) declare void @lgc.cps.jump(...) +; CHECK-LABEL: define void @_cont_KernelEntry( +; CHECK-NEXT: ret void + ; CHECK-LABEL: define spir_func void @raygen( ; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]]) !lgc.shaderstage [[META0:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { ; CHECK-NEXT: AllocaSpillBB: diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll index c0b0673fdb..279620cd4c 100644 --- a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll +++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll @@ -38,7 +38,7 @@ declare void @lgc.cps.jump(...) 
local_unnamed_addr !lgc.cps.module = !{} !continuation.maxPayloadRegisterCount = !{!11} -!continuation.preservedPayloadRegisterCount = !{!12} +!continuation.maxUsedPayloadRegisterCount = !{!12} !0 = !{i32 7} !1 = !{ { { i32 } } poison} diff --git a/llvmraytracing/test/lgccps/lower-traversal.ll b/llvmraytracing/test/lgccps/lower-traversal.ll index 0d6602dadf..cef8cab9d4 100644 --- a/llvmraytracing/test/lgccps/lower-traversal.ll +++ b/llvmraytracing/test/lgccps/lower-traversal.ll @@ -609,7 +609,7 @@ declare void @lgc.cps.jump(...) local_unnamed_addr declare ptr addrspace(7) @lgc.load.buffer.desc(i64 %0, i32 %1, i32 %2, i32 %3) local_unnamed_addr declare ptr @llvm.invariant.start.p7(i64 immarg %0, ptr addrspace(7) nocapture %1) -!continuation.preservedPayloadRegisterCount = !{!7} +!continuation.maxUsedPayloadRegisterCount = !{!7} !lgc.cps.module = !{} !lgc.rt.max.attribute.size = !{!4} diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 2a1ca060d5..19d92ea98e 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -712,6 +712,7 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.forwardPropagateNoContract = " << shaderInfo->options.forwardPropagateNoContract << "\n"; dumpFile << "options.constantBufferBindingOffset = " << shaderInfo->options.constantBufferBindingOffset << "\n"; dumpFile << "options.imageSampleDrefReturnsRgba = " << shaderInfo->options.imageSampleDrefReturnsRgba << "\n"; + dumpFile << "options.disableGlPositionOpt = " << shaderInfo->options.disableGlPositionOpt << "\n"; dumpFile << "\n"; // clang-format on } @@ -960,6 +961,30 @@ void PipelineDumper::dumpPipelineOptions(const PipelineOptions *options, std::os dumpFile << glStatePrefix << "enableLineSmooth = " << options->getGlState().enableLineSmooth << "\n"; dumpFile << glStatePrefix << "emulateWideLineStipple = " << options->getGlState().emulateWideLineStipple << "\n"; 
dumpFile << glStatePrefix << "enablePointSmooth = " << options->getGlState().enablePointSmooth << "\n"; + + // Output compile time constant info + if (options->compileConstInfo) { + auto compileConstInfo = options->compileConstInfo; + dumpFile << "options.compileTimeConstants.numCompileTimeConstants = " << compileConstInfo->numCompileTimeConstants + << "\n"; + for (unsigned i = 0; i < compileConstInfo->numCompileTimeConstants; ++i) { + dumpFile << "options.compileTimeConstants.constItem[" << i + << "].offset = " << compileConstInfo->pCompileTimeConstants[i].offset << "\n"; + dumpFile << "options.compileTimeConstants.constItem[" << i + << "].set = " << compileConstInfo->pCompileTimeConstants[i].set << "\n"; + dumpFile << "options.compileTimeConstants.constItem[" << i + << "].binding = " << compileConstInfo->pCompileTimeConstants[i].binding << "\n"; + dumpFile << "options.compileTimeConstants.constItem[" << i + << "].validBytes = " << compileConstInfo->pCompileTimeConstants[i].validBytes << "\n"; + dumpFile << "options.compileTimeConstants.constItem[" << i << "].values = "; + for (unsigned j = 0; j < compileConstInfo->pCompileTimeConstants[i].validBytes; ++j) { + dumpFile << compileConstInfo->pCompileTimeConstants[i].values.u32[j] << ""; + if (j < compileConstInfo->pCompileTimeConstants[i].validBytes - 1) + dumpFile << ", "; + } + dumpFile << "\n"; + } + } } // ===================================================================================================================== @@ -1060,6 +1085,7 @@ void PipelineDumper::dumpGraphicsStateInfo(const GraphicsPipelineBuildInfo *pipe dumpFile << "enableColorClampFs = " << pipelineInfo->glState.enableColorClampFs << "\n"; dumpFile << "enableFlatShade = " << pipelineInfo->glState.enableFlatShade << "\n"; dumpFile << "alphaTestFunc = " << pipelineInfo->glState.alphaTestFunc << "\n"; + dumpFile << "enableInitialUndefVar = " << pipelineInfo->enableInitUndefZero << "\n"; dumpFile << "originUpperLeft = " << 
pipelineInfo->getGlState().originUpperLeft << "\n"; if (pipelineInfo->clientMetadataSize > 0) { @@ -1551,6 +1577,8 @@ MetroHash::Hash PipelineDumper::generateHashForGraphicsPipeline(const GraphicsPi hasher.Update(pipeline->unlinked); hasher.Update(pipeline->enableEarlyCompile); hasher.Update(pipeline->dynamicTopology); + hasher.Update(pipeline->enableInitUndefZero); + if (unlinkedShaderType == UnlinkedStageFragment && isCacheHash) hasher.Update(pipeline->enableColorExportShader); updateHashForPipelineOptions(&pipeline->options, &hasher, isCacheHash, unlinkedShaderType); @@ -2003,6 +2031,7 @@ void PipelineDumper::updateHashForPipelineShaderInfo(ShaderStage stage, const Pi hasher->Update(options.backwardPropagateNoContract); hasher->Update(options.forwardPropagateNoContract); hasher->Update(options.imageSampleDrefReturnsRgba); + hasher->Update(options.disableGlPositionOpt); } } } diff --git a/tool/vfx/vfx.h b/tool/vfx/vfx.h index af2a234456..2e2696315b 100644 --- a/tool/vfx/vfx.h +++ b/tool/vfx/vfx.h @@ -156,9 +156,6 @@ class Float32 { // Constructor, initializes our VfxFloat32 with another VfxFloat32 Float32(const Float32 &other) : m_bits(other.m_bits) {} - // Destructor - ~Float32() {} - // Gets the numeric value float GetValue() const { return *reinterpret_cast(&m_bits.u32All); } diff --git a/tool/vfx/vfxParser.cpp b/tool/vfx/vfxParser.cpp index a661e1d7de..a87ee705d5 100644 --- a/tool/vfx/vfxParser.cpp +++ b/tool/vfx/vfxParser.cpp @@ -54,6 +54,7 @@ namespace Vfx { // Parser functions to parse a value by it's type bool parseInt(char *str, unsigned lineNum, IUFValue *output); +bool parseUint(char *str, unsigned lineNum, IUFValue *output); bool parseFloat(char *str, unsigned lineNum, IUFValue *output); bool parseFloat16(char *str, unsigned lineNum, IUFValue *output); bool parseDouble(char *str, unsigned lineNum, IUFValue *output); @@ -434,6 +435,12 @@ bool Document::parseKeyValue(char *key, char *valueStr, unsigned lineNum, Sectio result = 
accessedSectionObject->set(lineNum, memberName, arrayIndex, &(value.iVec4[0])); break; } + case MemberTypeUint: { + result = parseUint(valueStr, lineNum, &value); + if (result) + result = accessedSectionObject->set(lineNum, memberName, arrayIndex, &(value.uVec4[0])); + break; + } case MemberTypeFloat16: { result = parseFloat16(valueStr, lineNum, &value); if (result) @@ -708,6 +715,31 @@ bool parseInt(char *str, unsigned lineNum, IUFValue *output) { return result; } +// ===================================================================================================================== +// Parses an unsigned int number from a string. +// +// @param str : Input string +// @param lineNum : Current line number +// @param [out] output : Stores parsed value +bool parseUint(char *str, unsigned lineNum, IUFValue *output) { + VFX_ASSERT(output); + bool result = true; + + bool isHex = false; + char *p0x = strstr(str, "0x"); + if (p0x) + isHex = true; + + output->uVec4[0] = strtoul(str, nullptr, 0); + + output->props.isInt64 = false; + output->props.isFloat = false; + output->props.isDouble = false; + output->props.isHex = isHex; + output->props.length = 1; + + return result; +} // ===================================================================================================================== // Parses a float number from a string. 
// diff --git a/tool/vfx/vfxPipelineDoc.cpp b/tool/vfx/vfxPipelineDoc.cpp index caad47db8e..6281b8690f 100644 --- a/tool/vfx/vfxPipelineDoc.cpp +++ b/tool/vfx/vfxPipelineDoc.cpp @@ -416,6 +416,8 @@ bool PipelineDocument::getPtrOfSubSection(Section *section, unsigned lineNum, co #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 73 CASE_SUBSECTION(MemberTypeGlState, SectionGlState) #endif + CASE_SUBSECTION(MemberTypeCompileConstItem, SectionCompileConstItem) + CASE_SUBSECTION(MemberTypeCompileConstInfo, SectionCompileConstInfo) default: result = Document::getPtrOfSubSection(section, lineNum, memberName, memberType, isWriteAccess, arrayIndex, ptrOut, errorMsg); diff --git a/tool/vfx/vfxSection.h b/tool/vfx/vfxSection.h index 7a817eb03f..1ac6337469 100644 --- a/tool/vfx/vfxSection.h +++ b/tool/vfx/vfxSection.h @@ -83,6 +83,7 @@ enum SectionType : unsigned { // Enumerates VFX member type. enum MemberType : unsigned { MemberTypeInt, // VFX member type: 32 bit integer + MemberTypeUint, // VFX member type: 32 bit unsigned integer MemberTypeFloat, // VFX member type: 32 bit float MemberTypeFloat16, // VFX member type: 16 bit float MemberTypeDouble, // VFX member type: 64 bit double @@ -116,6 +117,8 @@ enum MemberType : unsigned { MemberTypeSpecEntryItem, // VFX member type: SectionSpecEntryItem MemberTypeResourceMappingNode, // VFX member type: SectionResourceMappingNode MemberTypeSpecInfo, // VFX member type: SectionSpecInfo + MemberTypeCompileConstItem, // VFX member type: SectionCompileConstItem + MemberTypeCompileConstInfo, // VFX member type: SectionCompileConstInfo MemberTypeDescriptorRangeValue, // VFX member type: SectionDescriptorRangeValueItem MemberTypePipelineOption, // VFX member type: SectionPipelineOption MemberTypeShaderOption, // VFX member type: SectionShaderOption @@ -356,7 +359,7 @@ struct StrToMemberAddrArrayRef { class Section { public: Section(StrToMemberAddrArrayRef addrTable, SectionType type, const char *sectionName); - virtual ~Section() {} + virtual 
~Section() = default; static SectionType getSectionType(const char *sectionName); static void initSectionInfo(); @@ -876,6 +879,83 @@ class SectionVertexInput : public Section { std::vector m_vbAddressLowBits; // Lowest two bits of vertex inputs offsets. }; +// ===================================================================================================================== +// Represents the sub section compile time constant map entry +class SectionCompileConstItem : public Section { +public: + typedef Vkgc::CompileTimeConst SubState; + + SectionCompileConstItem() : Section(getAddrTable(), SectionTypeUnset, "constItem") { + memset(&m_state, 0, sizeof(m_state)); + } + + void getSubState(SubState &state) { + state = m_state; + state.values.u32[0] = m_values.iVec4[0]; + state.values.u32[1] = m_values.iVec4[1]; + state.values.u32[2] = m_values.iVec4[2]; + state.values.u32[3] = m_values.iVec4[3]; + }; + SubState &getSubStateRef() { return m_state; }; + +private: + static StrToMemberAddrArrayRef getAddrTable() { + static std::vector addrTable = []() { + std::vector addrTableInitializer; + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, offset, MemberTypeInt, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, set, MemberTypeUint, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, binding, MemberTypeInt, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, validBytes, MemberTypeInt, false); + INIT_MEMBER_NAME_TO_ADDR(SectionCompileConstItem, m_values, MemberTypeIVec4, false); + return addrTableInitializer; + }(); + return {addrTable.data(), addrTable.size()}; + } + + SubState m_state; + IUFValue m_values = {}; +}; + +// ===================================================================================================================== +// Represents the sub section compile time constant info +class SectionCompileConstInfo : public Section { +public: + typedef Vkgc::CompileConstInfo SubState; + + 
SectionCompileConstInfo() : Section(getAddrTable(), SectionTypeUnset, "compileTimeConstants") { + memset(&m_state, 0, sizeof(m_state)); + } + + void getSubState(SubState &state) { + memset(&state, 0, sizeof(SubState)); + if (m_constItem.size()) { + m_state.numCompileTimeConstants = static_cast(m_constItem.size()); + m_compileConsts.resize(m_state.numCompileTimeConstants); + for (unsigned i = 0; i < m_compileConsts.size(); ++i) + m_constItem[i].getSubState(m_compileConsts[i]); + m_state.pCompileTimeConstants = &m_compileConsts[0]; + state = m_state; + } else + memset(&m_state, 0, sizeof(SubState)); + } + SubState &getSubStateRef() { return m_state; }; + +private: + static StrToMemberAddrArrayRef getAddrTable() { + static std::vector addrTable = []() { + std::vector addrTableInitializer; + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionCompileConstInfo, numCompileTimeConstants, MemberTypeInt, false); + INIT_MEMBER_DYNARRAY_NAME_TO_ADDR(SectionCompileConstInfo, m_constItem, MemberTypeCompileConstItem, true); + return addrTableInitializer; + }(); + return {addrTable.data(), addrTable.size()}; + } + + std::vector m_constItem; + std::vector m_compileConsts; + SubState m_state; +}; + // ===================================================================================================================== // Represents the sub section specialization constant map entry class SectionSpecEntryItem : public Section { diff --git a/tool/vfx/vfxVkSection.h b/tool/vfx/vfxVkSection.h index acdf016c59..f36c25ac15 100644 --- a/tool/vfx/vfxVkSection.h +++ b/tool/vfx/vfxVkSection.h @@ -257,6 +257,7 @@ class SectionShaderOption : public Section { INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, forwardPropagateNoContract, MemberTypeBool, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, constantBufferBindingOffset, MemberTypeInt, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, imageSampleDrefReturnsRgba, MemberTypeBool, false); + 
INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, disableGlPositionOpt, MemberTypeBool, false); return addrTableInitializer; }(); return {addrTable.data(), addrTable.size()}; @@ -481,6 +482,9 @@ class SectionPipelineOption : public Section { #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 73 m_glState.getSubState(m_state.glState); #endif + m_state.compileConstInfo = new Vkgc::CompileConstInfo(); + m_compileTimeConstants.getSubState(*m_state.compileConstInfo); + state.compileConstInfo = m_state.compileConstInfo; state = m_state; }; SubState &getSubStateRef() { return m_state; }; @@ -537,6 +541,7 @@ class SectionPipelineOption : public Section { #endif INIT_STATE_MEMBER_NAME_TO_ADDR(SectionPipelineOption, enablePrimGeneratedQuery, MemberTypeBool, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionPipelineOption, disablePerCompFetch, MemberTypeBool, false); + INIT_MEMBER_NAME_TO_ADDR(SectionPipelineOption, m_compileTimeConstants, MemberTypeCompileConstInfo, true); return addrTableInitializer; }(); return {addrTable.data(), addrTable.size()}; @@ -544,6 +549,7 @@ class SectionPipelineOption : public Section { SubState m_state; SectionExtendedRobustness m_extendedRobustness; + SectionCompileConstInfo m_compileTimeConstants; // Compile time constant info #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 73 SectionGlState m_glState; #endif @@ -956,6 +962,7 @@ class SectionGraphicsState : public Section { INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, useSoftwareVertexBufferDescriptors, MemberTypeBool, false); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_shaderLibrary, MemberTypeString, false); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_rtState, MemberTypeRtState, true); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, enableInitUndefZero, MemberTypeBool, false); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_clientMetadata, MemberTypeU8Array, false); INIT_MEMBER_ARRAY_NAME_TO_ADDR(SectionGraphicsState, m_uniformConstantMaps, MemberTypeUniformConstantMap, diff 
--git a/util/gpurtshim/GpurtShim.cpp b/util/gpurtshim/GpurtShim.cpp index 6e77b2c047..12f2265c8a 100644 --- a/util/gpurtshim/GpurtShim.cpp +++ b/util/gpurtshim/GpurtShim.cpp @@ -37,12 +37,6 @@ using namespace Vkgc; -void gpurt::getShaderLibrarySpirv(unsigned featureFlags, const void *&code, size_t &size) { - auto libCode = GpuRt::GetShaderLibraryCode(featureFlags); - code = libCode.pSpvCode; - size = libCode.spvSize; -} - RtIpVersion gpurt::getRtIpVersion(GfxIpVersion gfxIpVersion) { if (gfxIpVersion.major >= 11) return {2, 0}; @@ -71,6 +65,16 @@ static Pal::RayTracingIpLevel getRtIpLevel(RtIpVersion rtIpVersion) { abort(); } +void gpurt::getShaderLibrarySpirv(RtIpVersion rtIpVersion, unsigned featureFlags, const void *&code, size_t &size) { +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 48 + auto libCode = GpuRt::GetShaderLibraryCode(featureFlags); +#else + auto libCode = GpuRt::GetShaderLibraryCode(getRtIpLevel(rtIpVersion), featureFlags); +#endif + code = libCode.pSpvCode; + size = libCode.spvSize; +} + static void unmangleDxilName(char *dst, const char *src) { // input "\01?RayQueryProceed1_1@@YA_NURayQueryInternal@@IV?$vector@I$02@@@Z" // output "RayQueryProceed1_1" diff --git a/version/include/llpcVersion.h.in b/version/include/llpcVersion.h.in index 48aec80a2d..773df94485 100644 --- a/version/include/llpcVersion.h.in +++ b/version/include/llpcVersion.h.in @@ -37,6 +37,9 @@ // %Version History // | %Version | Change Description | // | -------- | ----------------------------------------------------------------------------------------------------- | +// | 75.4 | Add disableGlPositionOpt to PipelineShaderOptions. | +// | 75.3 | Add enableInitUndefZero to GraphicsPipelineBuildInfo. | +// | 75.2 | Add CompileConstInfo to PipelineShaderOptions. | // | 75.1 | Add alphaFunc to GraphicPipelineBuildInfo. | // | 75.0 | BuildRayTracingPipeline now will not generate kernel entry for pipeline library anymore. | // | 74.2 | Add enableMapClipDistMask to GraphicsPipelineBuildInfo. 
| @@ -193,7 +196,7 @@ #define LLPC_INTERFACE_MAJOR_VERSION 75 /// LLPC minor interface version. -#define LLPC_INTERFACE_MINOR_VERSION 0 +#define LLPC_INTERFACE_MINOR_VERSION 4 /// The client's LLPC major interface version #ifndef LLPC_CLIENT_INTERFACE_MAJOR_VERSION